From 93c91e39b29142dec1d03a30df9f6e757f56c193 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 19 Jul 2017 07:02:10 +0000 Subject: Vendor import of llvm trunk r308421: https://llvm.org/svn/llvm-project/llvm/trunk@308421 --- RELEASE_TESTERS.TXT | 7 +- docs/AliasAnalysis.rst | 3 +- docs/CodingStandards.rst | 6 +- docs/CommandGuide/lit.rst | 7 + include/llvm/Analysis/DominanceFrontier.h | 59 +- include/llvm/Analysis/DominanceFrontierImpl.h | 36 +- include/llvm/Analysis/IteratedDominanceFrontier.h | 20 +- include/llvm/Analysis/LazyCallGraph.h | 28 +- include/llvm/Analysis/LoopInfo.h | 9 +- include/llvm/Analysis/LoopInfoImpl.h | 17 +- include/llvm/Analysis/PostDominators.h | 6 +- include/llvm/Analysis/ScalarEvolution.h | 48 +- include/llvm/Analysis/TargetTransformInfo.h | 11 + include/llvm/Analysis/TargetTransformInfoImpl.h | 6 + include/llvm/CodeGen/BasicTTIImpl.h | 12 + include/llvm/CodeGen/MachineDominanceFrontier.h | 31 +- include/llvm/CodeGen/MachineDominators.h | 15 +- include/llvm/CodeGen/MachinePostDominators.h | 2 +- include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 16 +- include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 2 +- include/llvm/DebugInfo/CodeView/Formatters.h | 10 +- include/llvm/DebugInfo/CodeView/GUID.h | 55 + include/llvm/DebugInfo/CodeView/SymbolRecord.h | 2 +- include/llvm/DebugInfo/CodeView/TypeRecord.h | 13 +- .../llvm/DebugInfo/CodeView/TypeServerHandler.h | 38 - include/llvm/DebugInfo/CodeView/TypeStreamMerger.h | 14 +- include/llvm/DebugInfo/DWARF/DWARFUnit.h | 28 + include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 44 +- include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h | 2 +- include/llvm/DebugInfo/PDB/GenericError.h | 1 + include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 2 +- include/llvm/DebugInfo/PDB/Native/Formatters.h | 7 - include/llvm/DebugInfo/PDB/Native/InfoStream.h | 5 +- .../llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h | 4 +- .../llvm/DebugInfo/PDB/Native/NativeExeSymbol.h | 2 +- .../llvm/DebugInfo/PDB/Native/NativeRawSymbol.h | 
2 +- .../DebugInfo/PDB/Native/PDBTypeServerHandler.h | 46 - include/llvm/DebugInfo/PDB/Native/RawTypes.h | 14 +- include/llvm/DebugInfo/PDB/Native/TpiHashing.h | 73 +- include/llvm/DebugInfo/PDB/PDBExtras.h | 1 - .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 24 +- include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 5 +- include/llvm/IR/CallingConv.h | 14 +- include/llvm/IR/Constants.h | 8 + include/llvm/IR/DIBuilder.h | 37 +- include/llvm/IR/DebugInfoMetadata.h | 28 +- include/llvm/IR/Dominators.h | 49 +- include/llvm/IR/IntrinsicsHexagon.td | 26 +- include/llvm/IR/IntrinsicsSystemZ.td | 43 + include/llvm/MC/LaneBitmask.h | 3 + include/llvm/MC/MCFixup.h | 2 +- include/llvm/MC/MCInstrDesc.h | 9 + include/llvm/Object/COFFImportFile.h | 2 +- include/llvm/Object/COFFModuleDefinition.h | 8 +- include/llvm/ObjectYAML/CodeViewYAMLTypes.h | 2 + include/llvm/Support/AArch64TargetParser.def | 5 +- include/llvm/Support/BinaryItemStream.h | 39 +- include/llvm/Support/Format.h | 25 +- include/llvm/Support/GenericDomTree.h | 160 +- include/llvm/Support/GenericDomTreeConstruction.h | 670 +++- include/llvm/Support/TargetParser.h | 4 +- include/llvm/Support/YAMLTraits.h | 4 + .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 1 + include/llvm/Target/TargetLowering.h | 29 + .../llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h | 24 + lib/Analysis/CGSCCPassManager.cpp | 9 +- lib/Analysis/DominanceFrontier.cpp | 3 +- lib/Analysis/InstCount.cpp | 8 - lib/Analysis/InstructionSimplify.cpp | 21 +- lib/Analysis/IteratedDominanceFrontier.cpp | 8 +- lib/Analysis/LazyCallGraph.cpp | 49 +- lib/Analysis/LoopInfo.cpp | 2 +- lib/Analysis/MemorySSA.cpp | 1 - lib/Analysis/PostDominators.cpp | 2 + lib/Analysis/ScalarEvolution.cpp | 385 ++- lib/Analysis/TargetTransformInfo.cpp | 5 + lib/AsmParser/LLLexer.cpp | 2 +- lib/AsmParser/LLParser.cpp | 10 +- lib/AsmParser/LLToken.h | 2 +- lib/Bitcode/Reader/MetadataLoader.cpp | 8 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 1 + 
lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 5 +- lib/CodeGen/CodeGenPrepare.cpp | 81 +- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 3 + lib/CodeGen/MachineCombiner.cpp | 4 +- lib/CodeGen/MachineDominanceFrontier.cpp | 3 +- lib/CodeGen/MachineDominators.cpp | 6 +- lib/CodeGen/MachinePostDominators.cpp | 7 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 161 +- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 21 +- lib/CodeGen/XRayInstrumentation.cpp | 6 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 77 +- lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 9 +- lib/DebugInfo/CodeView/Formatters.cpp | 7 + lib/DebugInfo/CodeView/SymbolDumper.cpp | 2 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 2 +- lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 173 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 195 +- lib/DebugInfo/PDB/CMakeLists.txt | 1 - lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 12 +- lib/DebugInfo/PDB/GenericError.cpp | 2 + lib/DebugInfo/PDB/Native/InfoStream.cpp | 2 +- lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp | 2 +- lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 4 +- lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 4 +- lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp | 126 - lib/DebugInfo/PDB/Native/TpiHashing.cpp | 128 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 1 - lib/DebugInfo/PDB/PDBExtras.cpp | 6 - .../RuntimeDyld/Targets/RuntimeDyldMachOARM.h | 1 - lib/Fuzzer/CMakeLists.txt | 1 - lib/Fuzzer/FuzzerCorpus.h | 40 +- lib/Fuzzer/FuzzerDriver.cpp | 20 +- lib/Fuzzer/FuzzerFlags.def | 3 + lib/Fuzzer/FuzzerInternal.h | 14 +- lib/Fuzzer/FuzzerLoop.cpp | 30 +- lib/Fuzzer/FuzzerMerge.cpp | 8 +- lib/Fuzzer/FuzzerMutate.cpp | 23 +- lib/Fuzzer/FuzzerMutate.h | 6 - lib/Fuzzer/FuzzerTracePC.cpp | 83 + lib/Fuzzer/FuzzerTracePC.h | 23 + lib/Fuzzer/FuzzerTraceState.cpp | 181 -- lib/Fuzzer/FuzzerUtil.cpp | 7 + lib/Fuzzer/FuzzerUtil.h | 10 + lib/Fuzzer/afl/afl_driver.cpp | 4 +- lib/Fuzzer/test/CMakeLists.txt | 3 +- lib/Fuzzer/test/FlagsTest.cpp | 32 + 
lib/Fuzzer/test/FuzzerUnittest.cpp | 31 +- lib/Fuzzer/test/fuzzer-flags.test | 17 +- lib/Fuzzer/test/fuzzer-traces-hooks.test | 2 +- lib/Fuzzer/test/reduce_inputs.test | 3 +- lib/IR/AsmWriter.cpp | 3 +- lib/IR/Constants.cpp | 86 +- lib/IR/Core.cpp | 1 - lib/IR/DIBuilder.cpp | 26 +- lib/IR/DebugInfoMetadata.cpp | 9 +- lib/IR/Dominators.cpp | 42 +- lib/IR/LLVMContextImpl.h | 16 +- lib/IR/LegacyPassManager.cpp | 51 +- lib/IR/Module.cpp | 2 +- lib/Object/ArchiveWriter.cpp | 3 +- lib/Object/COFFImportFile.cpp | 144 +- lib/Object/COFFModuleDefinition.cpp | 36 +- lib/Object/COFFObjectFile.cpp | 5 +- lib/ObjectYAML/CodeViewYAMLTypes.cpp | 27 + lib/Option/OptTable.cpp | 25 +- lib/Support/ErrorHandling.cpp | 22 + lib/Support/Host.cpp | 41 +- lib/Support/Path.cpp | 2 - lib/Support/TargetParser.cpp | 2 + lib/Support/YAMLTraits.cpp | 8 + lib/Support/raw_ostream.cpp | 29 +- lib/Target/AArch64/AArch64.h | 4 + lib/Target/AArch64/AArch64.td | 4 + lib/Target/AArch64/AArch64CallingConvention.td | 7 + .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 4 + lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 790 +++++ lib/Target/AArch64/AArch64FastISel.cpp | 1 + lib/Target/AArch64/AArch64FrameLowering.cpp | 12 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 15 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 64 +- lib/Target/AArch64/AArch64ISelLowering.h | 16 + lib/Target/AArch64/AArch64InstrAtomics.td | 10 + lib/Target/AArch64/AArch64InstrInfo.cpp | 13 +- lib/Target/AArch64/AArch64InstrInfo.h | 12 +- lib/Target/AArch64/AArch64InstrInfo.td | 2 + lib/Target/AArch64/AArch64LegalizerInfo.cpp | 2 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 +- lib/Target/AArch64/AArch64Subtarget.cpp | 7 +- lib/Target/AArch64/AArch64Subtarget.h | 13 + lib/Target/AArch64/AArch64TargetMachine.cpp | 23 +- lib/Target/AArch64/AArch64TargetMachine.h | 2 + lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 21 +- lib/Target/AArch64/CMakeLists.txt | 1 + .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 6 +- 
.../MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 61 +- lib/Target/AMDGPU/AMDGPU.h | 2 +- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 252 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 9 - lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 + lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 5 + .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + lib/Target/AMDGPU/SIFoldOperands.cpp | 1 + lib/Target/AMDGPU/SIFrameLowering.cpp | 11 +- lib/Target/AMDGPU/SIISelLowering.cpp | 119 +- lib/Target/AMDGPU/SIISelLowering.h | 2 + lib/Target/AMDGPU/SIInstrInfo.cpp | 24 +- lib/Target/AMDGPU/SIInstrInfo.h | 19 +- lib/Target/AMDGPU/SIInstrInfo.td | 2 +- lib/Target/AMDGPU/SIInstructions.td | 2 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 68 +- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 + lib/Target/AMDGPU/SIRegisterInfo.td | 17 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 +- lib/Target/AMDGPU/VOP2Instructions.td | 11 +- lib/Target/AMDGPU/VOP3Instructions.td | 11 +- lib/Target/AMDGPU/VOP3PInstructions.td | 10 +- lib/Target/AMDGPU/VOPCInstructions.td | 24 +- lib/Target/AMDGPU/VOPInstructions.td | 2 + lib/Target/ARM/ARM.td | 431 +-- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 4 +- lib/Target/ARM/ARMFastISel.cpp | 43 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 20 +- lib/Target/ARM/ARMInstructionSelector.cpp | 23 + lib/Target/ARM/ARMLegalizerInfo.cpp | 8 +- lib/Target/ARM/ARMMCInstLower.cpp | 4 +- lib/Target/ARM/ARMRegisterBankInfo.cpp | 9 + lib/Target/ARM/ARMTargetMachine.h | 2 + lib/Target/BPF/BPFISelLowering.cpp | 45 +- lib/Target/BPF/BPFInstrInfo.td | 5 + lib/Target/Hexagon/HexagonBitSimplify.cpp | 4 +- lib/Target/Hexagon/HexagonDepInstrInfo.td | 4 + lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 4 +- lib/Target/Hexagon/HexagonExpandCondsets.cpp | 4 +- 
lib/Target/Hexagon/HexagonFrameLowering.cpp | 4 +- lib/Target/Hexagon/HexagonGenInsert.cpp | 4 +- lib/Target/Hexagon/HexagonGenPredicate.cpp | 4 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 147 +- lib/Target/Hexagon/HexagonISelLowering.h | 4 +- lib/Target/Hexagon/HexagonIntrinsics.td | 12 + lib/Target/Hexagon/HexagonOptAddrMode.cpp | 4 +- lib/Target/Hexagon/HexagonPatterns.td | 92 +- lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 47 + lib/Target/Hexagon/HexagonTargetObjectFile.h | 6 + lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 219 -- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 15 - lib/Target/Mips/Mips.td | 3 + lib/Target/Mips/MipsISelLowering.cpp | 146 +- lib/Target/Mips/MipsInstrFPU.td | 9 + lib/Target/Mips/MipsMTInstrFormats.td | 21 - lib/Target/Mips/MipsMTInstrInfo.td | 110 - lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 99 +- lib/Target/Mips/MipsSEISelDAGToDAG.h | 3 +- lib/Target/Mips/MipsSEISelLowering.cpp | 203 +- lib/Target/Mips/MipsSchedule.td | 4 - lib/Target/Mips/MipsSubtarget.h | 5 + lib/Target/Mips/MipsTargetStreamer.h | 3 - .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 3 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 28 +- lib/Target/PowerPC/PPCISelLowering.cpp | 18 +- lib/Target/PowerPC/PPCISelLowering.h | 2 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 10 +- lib/Target/PowerPC/PPCInstrInfo.td | 22 +- lib/Target/PowerPC/PPCInstrVSX.td | 90 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 30 +- lib/Target/PowerPC/PPCTargetMachine.h | 2 + lib/Target/Sparc/Sparc.td | 6 +- lib/Target/Sparc/SparcISelLowering.cpp | 13 + lib/Target/Sparc/SparcInstrInfo.td | 3 + lib/Target/Sparc/SparcSubtarget.cpp | 1 + lib/Target/Sparc/SparcSubtarget.h | 2 + lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 15 +- lib/Target/SystemZ/LLVMBuild.txt | 2 +- lib/Target/SystemZ/SystemZFeatures.td | 58 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 104 +- lib/Target/SystemZ/SystemZISelLowering.h | 7 + lib/Target/SystemZ/SystemZInstrFP.td | 88 +- 
lib/Target/SystemZ/SystemZInstrFormats.td | 330 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 32 + lib/Target/SystemZ/SystemZInstrInfo.td | 68 +- lib/Target/SystemZ/SystemZInstrSystem.td | 4 + lib/Target/SystemZ/SystemZInstrVector.td | 366 ++- lib/Target/SystemZ/SystemZOperators.td | 20 + lib/Target/SystemZ/SystemZPatterns.td | 7 + lib/Target/SystemZ/SystemZProcessors.td | 3 + lib/Target/SystemZ/SystemZRegisterInfo.td | 14 +- lib/Target/SystemZ/SystemZSchedule.td | 3 +- lib/Target/SystemZ/SystemZScheduleZ14.td | 1611 ++++++++++ lib/Target/SystemZ/SystemZScheduleZ196.td | 163 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 161 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 40 + lib/Target/SystemZ/SystemZSubtarget.cpp | 4 + lib/Target/SystemZ/SystemZSubtarget.h | 34 + lib/Target/SystemZ/SystemZTargetMachine.cpp | 4 +- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 3 + lib/Target/SystemZ/SystemZTargetTransformInfo.h | 4 + lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.h | 3 + lib/Target/X86/X86.td | 6 +- lib/Target/X86/X86CallingConv.td | 4 +- lib/Target/X86/X86CmovConversion.cpp | 611 ++++ lib/Target/X86/X86FastISel.cpp | 4 +- lib/Target/X86/X86FixupBWInsts.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 66 +- lib/Target/X86/X86InstrAVX512.td | 231 +- lib/Target/X86/X86RegisterInfo.cpp | 6 +- lib/Target/X86/X86Schedule.td | 1 + lib/Target/X86/X86ScheduleBtVer2.td | 16 + lib/Target/X86/X86ScheduleZnver1.td | 223 ++ lib/Target/X86/X86Subtarget.h | 2 +- lib/Target/X86/X86TargetMachine.cpp | 1 + lib/Target/X86/X86TargetMachine.h | 2 + lib/ToolDrivers/CMakeLists.txt | 1 + lib/ToolDrivers/LLVMBuild.txt | 2 +- lib/ToolDrivers/llvm-dlltool/CMakeLists.txt | 9 + lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp | 160 + lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt | 22 + lib/ToolDrivers/llvm-dlltool/Options.td | 26 + lib/Transforms/IPO/GlobalOpt.cpp | 18 + lib/Transforms/IPO/Inliner.cpp | 2 +- lib/Transforms/IPO/SampleProfile.cpp | 11 +- 
lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 56 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 36 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 5 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 6 +- .../InstCombine/InstructionCombining.cpp | 76 +- .../Instrumentation/AddressSanitizer.cpp | 38 + lib/Transforms/Instrumentation/MemorySanitizer.cpp | 4 +- .../Instrumentation/SanitizerCoverage.cpp | 10 + lib/Transforms/Scalar/EarlyCSE.cpp | 16 +- lib/Transforms/Scalar/GVN.cpp | 1 + .../Scalar/InductiveRangeCheckElimination.cpp | 72 +- lib/Transforms/Scalar/JumpThreading.cpp | 82 +- lib/Transforms/Scalar/LoopInterchange.cpp | 119 +- lib/Transforms/Scalar/TailRecursionElimination.cpp | 15 +- lib/Transforms/Utils/LoopUnrollRuntime.cpp | 30 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 64 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 30 +- runtimes/CMakeLists.txt | 9 +- test/Analysis/CostModel/SystemZ/fp-arith.ll | 53 +- test/Assembler/diimportedentity.ll | 4 +- test/Bitcode/DIGlobalVariableExpression.ll | 2 +- test/Bitcode/compatibility-3.6.ll | 6 +- test/Bitcode/compatibility-3.7.ll | 6 +- test/Bitcode/compatibility-3.8.ll | 6 +- test/Bitcode/compatibility-3.9.ll | 6 +- test/Bitcode/compatibility-4.0.ll | 6 +- test/Bitcode/compatibility.ll | 6 +- test/Bitcode/upgrade-importedentity.ll | 15 + test/Bitcode/upgrade-importedentity.ll.bc | Bin 0 -> 1216 bytes test/CMakeLists.txt | 3 + test/CodeGen/AArch64/GlobalISel/select-fma.mir | 41 + test/CodeGen/AArch64/aarch64_win64cc_vararg.ll | 74 + test/CodeGen/AArch64/arm64-abi-varargs.ll | 3 +- test/CodeGen/AArch64/arm64-abi_align.ll | 32 +- .../AArch64/arm64-alloca-frame-pointer-offset.ll | 6 +- test/CodeGen/AArch64/arm64-extern-weak.ll | 2 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 10 + test/CodeGen/AArch64/arm64-platform-reg.ll | 1 + test/CodeGen/AArch64/arm64-vext.ll | 8 +- test/CodeGen/AArch64/atomic-ops-lse.ll | 161 + test/CodeGen/AArch64/dag-combine-invaraints.ll | 2 +- 
test/CodeGen/AArch64/extern-weak.ll | 2 +- test/CodeGen/AArch64/falkor-hwpf-fix.ll | 67 + test/CodeGen/AArch64/falkor-hwpf-fix.mir | 52 + test/CodeGen/AArch64/falkor-hwpf.ll | 106 + .../AArch64/preferred-function-alignment.ll | 2 +- test/CodeGen/AArch64/swifterror.ll | 12 +- test/CodeGen/AArch64/win64_vararg.ll | 95 + .../AMDGPU/annotate-kernel-features-hsa-call.ll | 312 ++ .../CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 11 + .../AMDGPU/attr-amdgpu-flat-work-group-size.ll | 2 +- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll | 14 +- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 62 +- test/CodeGen/AMDGPU/function-args.ll | 16 + test/CodeGen/AMDGPU/hsa.ll | 10 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 12 + test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 4 +- test/CodeGen/AMDGPU/move-to-valu-worklist.ll | 29 + test/CodeGen/AMDGPU/mubuf-offset-private.ll | 26 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/parallelorifcollapse.ll | 2 +- test/CodeGen/AMDGPU/private-access-no-objects.ll | 10 +- .../rename-independent-subregs-mac-operands.mir | 2 +- test/CodeGen/AMDGPU/scratch-simple.ll | 72 +- test/CodeGen/AMDGPU/sdwa-peephole-instr.mir | 35 +- test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir | 4 +- test/CodeGen/AMDGPU/trap.ll | 8 +- .../CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 2 +- .../vgpr-spill-emergency-stack-slot-compute.ll | 36 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 6 +- .../ARM/GlobalISel/arm-instruction-select.mir | 39 + test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll | 52 + test/CodeGen/ARM/GlobalISel/arm-isel.ll | 39 + .../CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir | 174 + test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 36 + test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 30 + test/CodeGen/ARM/atomic-op.ll | 15 + test/CodeGen/AVR/branch-relaxation.ll | 4 +- test/CodeGen/BPF/select_ri.ll | 27 + test/CodeGen/BPF/setcc.ll | 4 +- 
test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll | 1 - .../CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll | 3 - test/CodeGen/Generic/print-mul-exp.ll | 1 + test/CodeGen/Generic/print-mul.ll | 1 + test/CodeGen/Generic/print-shift.ll | 1 + test/CodeGen/Generic/v-split.ll | 3 - test/CodeGen/Generic/vector-redux.ll | 3 - test/CodeGen/Generic/vector.ll | 3 - test/CodeGen/Hexagon/intrinsics/system_user.ll | 76 +- .../CodeGen/Hexagon/switch-lut-explicit-section.ll | 32 + .../CodeGen/Hexagon/switch-lut-function-section.ll | 30 + .../Hexagon/switch-lut-multiple-functions.ll | 42 + test/CodeGen/Hexagon/switch-lut-text-section.ll | 27 + test/CodeGen/Hexagon/v6vec-vprint.ll | 2 +- test/CodeGen/Hexagon/vect/vect-load-v4i16.ll | 23 + test/CodeGen/Hexagon/vect/vect-loadv4i16.ll | 73 - test/CodeGen/Hexagon/vect/vect-v4i16.ll | 73 + test/CodeGen/MIR/AArch64/target-memoperands.mir | 4 + test/CodeGen/MIR/AMDGPU/fold-multiple.mir | 40 + test/CodeGen/MSP430/vararg.ll | 4 +- test/CodeGen/Mips/2008-06-05-Carry.ll | 13 +- test/CodeGen/Mips/dins.ll | 4 +- test/CodeGen/Mips/dsp-patterns.ll | 4 +- test/CodeGen/Mips/llcarry.ll | 11 +- test/CodeGen/Mips/llvm-ir/add.ll | 394 ++- test/CodeGen/Mips/llvm-ir/sub.ll | 174 +- test/CodeGen/Mips/long-calls.ll | 57 + test/CodeGen/Mips/madd-msub.ll | 81 +- test/CodeGen/Mips/msa/f16-llvm-ir.ll | 12 +- test/CodeGen/PowerPC/PR33671.ll | 32 + test/CodeGen/PowerPC/build-vector-tests.ll | 40 +- test/CodeGen/PowerPC/ppc64-i128-abi.ll | 6 +- test/CodeGen/PowerPC/swaps-le-6.ll | 8 +- test/CodeGen/PowerPC/vsx-p9.ll | 48 +- test/CodeGen/SPARC/soft-mul-div.ll | 65 + test/CodeGen/SystemZ/branch-11.ll | 56 + test/CodeGen/SystemZ/fp-abs-03.ll | 43 + test/CodeGen/SystemZ/fp-abs-04.ll | 46 + test/CodeGen/SystemZ/fp-add-01.ll | 6 +- test/CodeGen/SystemZ/fp-add-04.ll | 17 + test/CodeGen/SystemZ/fp-cmp-01.ll | 102 +- test/CodeGen/SystemZ/fp-cmp-06.ll | 33 + test/CodeGen/SystemZ/fp-const-11.ll | 40 + test/CodeGen/SystemZ/fp-conv-15.ll | 50 + 
test/CodeGen/SystemZ/fp-conv-16.ll | 99 + test/CodeGen/SystemZ/fp-copysign-02.ll | 81 + test/CodeGen/SystemZ/fp-div-01.ll | 6 +- test/CodeGen/SystemZ/fp-div-04.ll | 17 + test/CodeGen/SystemZ/fp-move-13.ll | 46 + test/CodeGen/SystemZ/fp-mul-01.ll | 6 +- test/CodeGen/SystemZ/fp-mul-06.ll | 31 +- test/CodeGen/SystemZ/fp-mul-08.ll | 31 +- test/CodeGen/SystemZ/fp-mul-10.ll | 43 + test/CodeGen/SystemZ/fp-mul-11.ll | 32 + test/CodeGen/SystemZ/fp-mul-12.ll | 72 + test/CodeGen/SystemZ/fp-neg-02.ll | 41 + test/CodeGen/SystemZ/fp-round-03.ll | 207 ++ test/CodeGen/SystemZ/fp-sqrt-01.ll | 8 +- test/CodeGen/SystemZ/fp-sqrt-04.ll | 17 + test/CodeGen/SystemZ/fp-sub-01.ll | 6 +- test/CodeGen/SystemZ/fp-sub-04.ll | 17 + test/CodeGen/SystemZ/int-add-17.ll | 95 + test/CodeGen/SystemZ/int-mul-09.ll | 95 + test/CodeGen/SystemZ/int-mul-10.ll | 165 + test/CodeGen/SystemZ/int-mul-11.ll | 32 + test/CodeGen/SystemZ/int-sub-10.ll | 95 + test/CodeGen/SystemZ/tdc-07.ll | 18 + test/CodeGen/SystemZ/vec-abs-06.ll | 47 + test/CodeGen/SystemZ/vec-add-02.ll | 24 + test/CodeGen/SystemZ/vec-and-04.ll | 47 + test/CodeGen/SystemZ/vec-cmp-07.ll | 349 ++ test/CodeGen/SystemZ/vec-ctpop-02.ll | 45 + test/CodeGen/SystemZ/vec-div-02.ll | 24 + test/CodeGen/SystemZ/vec-intrinsics-01.ll | 3335 ++++++++++++++++++++ test/CodeGen/SystemZ/vec-intrinsics-02.ll | 441 +++ test/CodeGen/SystemZ/vec-intrinsics.ll | 3335 -------------------- test/CodeGen/SystemZ/vec-max-05.ll | 175 + test/CodeGen/SystemZ/vec-min-05.ll | 175 + test/CodeGen/SystemZ/vec-move-18.ll | 24 + test/CodeGen/SystemZ/vec-mul-03.ll | 24 + test/CodeGen/SystemZ/vec-mul-04.ll | 31 + test/CodeGen/SystemZ/vec-mul-05.ll | 63 + test/CodeGen/SystemZ/vec-neg-02.ll | 23 + test/CodeGen/SystemZ/vec-or-03.ll | 91 + test/CodeGen/SystemZ/vec-round-02.ll | 118 + test/CodeGen/SystemZ/vec-sqrt-02.ll | 23 + test/CodeGen/SystemZ/vec-sub-02.ll | 31 + test/CodeGen/SystemZ/vec-xor-02.ll | 47 + test/CodeGen/Thumb/litpoolremat.ll | 28 + test/CodeGen/Thumb/select.ll | 4 +- 
test/CodeGen/WebAssembly/indirect-import.ll | 9 +- test/CodeGen/WebAssembly/userstack.ll | 10 +- test/CodeGen/X86/2008-01-08-SchedulerCrash.ll | 2 +- test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll | 2 +- test/CodeGen/X86/2011-10-19-widen_vselect.ll | 7 +- test/CodeGen/X86/DynamicCalleeSavedRegisters.ll | 2 +- test/CodeGen/X86/alias-static-alloca.ll | 37 + test/CodeGen/X86/atomic-minmax-i6432.ll | 16 +- test/CodeGen/X86/atomic128.ll | 64 +- test/CodeGen/X86/avx-schedule.ll | 508 +-- test/CodeGen/X86/avx2-arith.ll | 8 +- test/CodeGen/X86/avx2-schedule.ll | 116 +- test/CodeGen/X86/avx2-vector-shifts.ll | 4 +- test/CodeGen/X86/avx512-cvt.ll | 2 +- test/CodeGen/X86/avx512-mask-op.ll | 5 +- test/CodeGen/X86/avx512-rotate.ll | 256 ++ test/CodeGen/X86/avx512-shift.ll | 148 +- test/CodeGen/X86/bmi-schedule.ll | 529 ++++ test/CodeGen/X86/bmi2-schedule.ll | 180 ++ test/CodeGen/X86/bool-ext-inc.ll | 8 +- test/CodeGen/X86/bswap-rotate.ll | 27 + test/CodeGen/X86/clobber-fi0.ll | 14 +- test/CodeGen/X86/combine-rotates.ll | 27 +- test/CodeGen/X86/combine-shl.ll | 12 +- test/CodeGen/X86/combine-srl.ll | 8 +- test/CodeGen/X86/combine-udiv.ll | 2 +- test/CodeGen/X86/combine-urem.ll | 8 +- test/CodeGen/X86/f16c-schedule.ll | 144 + test/CodeGen/X86/fast-isel-x86-64.ll | 2 +- test/CodeGen/X86/hipe-cc.ll | 6 +- test/CodeGen/X86/hipe-cc64.ll | 6 +- test/CodeGen/X86/lea32-schedule.ll | 653 ++++ test/CodeGen/X86/lea64-schedule.ll | 534 ++++ test/CodeGen/X86/legalize-shift-64.ll | 8 +- test/CodeGen/X86/lzcnt-schedule.ll | 119 + test/CodeGen/X86/machine-outliner-debuginfo.ll | 1 + test/CodeGen/X86/machine-outliner.ll | 1 + test/CodeGen/X86/memcmp-minsize.ll | 721 +++++ test/CodeGen/X86/memcmp-optsize.ll | 871 +++++ test/CodeGen/X86/memcmp.ll | 827 +++-- test/CodeGen/X86/pmul.ll | 6 +- test/CodeGen/X86/popcnt-schedule.ll | 167 + test/CodeGen/X86/pr32282.ll | 104 + test/CodeGen/X86/pr32515.ll | 29 + test/CodeGen/X86/pr33772.ll | 15 + test/CodeGen/X86/pr33828.ll | 48 + 
test/CodeGen/X86/regparm.ll | 2 +- test/CodeGen/X86/rotate_vec.ll | 54 + test/CodeGen/X86/sibcall-win64.ll | 22 +- test/CodeGen/X86/sse-schedule.ll | 327 +- test/CodeGen/X86/sse2-schedule.ll | 824 ++++- test/CodeGen/X86/sse3-schedule.ll | 64 +- test/CodeGen/X86/sse41-schedule.ll | 311 +- test/CodeGen/X86/sse42-schedule.ll | 81 +- test/CodeGen/X86/sse4a-schedule.ll | 40 +- test/CodeGen/X86/ssse3-schedule.ll | 98 +- test/CodeGen/X86/statepoint-invoke.ll | 2 +- test/CodeGen/X86/statepoint-stack-usage.ll | 42 +- test/CodeGen/X86/statepoint-vector.ll | 4 +- test/CodeGen/X86/vec_cmp_uint-128.ll | 8 +- test/CodeGen/X86/vector-idiv-sdiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-sdiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv.ll | 2 +- test/CodeGen/X86/vector-rotate-128.ll | 203 +- test/CodeGen/X86/vector-rotate-256.ll | 256 +- test/CodeGen/X86/vector-rotate-512.ll | 831 +++++ test/CodeGen/X86/vector-shift-ashr-256.ll | 10 +- test/CodeGen/X86/vector-tzcnt-128.ll | 4 +- test/CodeGen/X86/vector-tzcnt-256.ll | 8 +- test/CodeGen/X86/vector-tzcnt-512.ll | 8 +- test/CodeGen/X86/vselect-avx.ll | 8 +- test/CodeGen/X86/widen_arith-2.ll | 15 +- test/CodeGen/X86/widen_cast-4.ll | 34 +- test/CodeGen/X86/win64-nosse-csrs.ll | 2 +- test/CodeGen/X86/win64_nonvol.ll | 2 +- test/CodeGen/X86/win64_params.ll | 2 +- test/CodeGen/X86/win_chkstk.ll | 2 +- test/CodeGen/X86/win_coreclr_chkstk.ll | 4 +- test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 14 +- test/CodeGen/X86/x86-cmov-converter.ll | 321 ++ test/CodeGen/XCore/varargs.ll | 8 +- test/DebugInfo/Generic/namespace.ll | 41 +- test/DebugInfo/PDB/pdbdump-headers.test | 184 +- test/DebugInfo/X86/DIModule.ll | 2 +- test/DebugInfo/X86/DIModuleContext.ll | 2 +- test/DebugInfo/X86/fission-inline.ll | 2 +- test/DebugInfo/X86/gnu-public-names.ll | 4 +- test/DebugInfo/X86/lexical-block-file-inline.ll | 2 +- test/DebugInfo/X86/pr19307.ll | 12 +- 
test/DllTool/coff-exports.def | 13 + test/DllTool/coff-weak-exports.def | 19 + test/DllTool/lit.local.cfg | 1 + test/FileCheck/regex-scope.txt | 2 +- test/Instrumentation/AddressSanitizer/basic.ll | 20 + .../AddressSanitizer/stack-poisoning-byval-args.ll | 48 + .../unordered_atomic_mem_intrins.ll | 37 + .../EfficiencySanitizer/working_set_basic.ll | 33 + .../EfficiencySanitizer/working_set_slow.ll | 32 + test/Instrumentation/MemorySanitizer/msan_basic.ll | 35 + .../SanitizerCoverage/cmp-tracing-api-x86_32.ll | 22 + .../SanitizerCoverage/cmp-tracing-api-x86_64.ll | 22 + test/Linker/pr26037.ll | 4 +- test/MC/AArch64/coff-relocations.s | 52 + test/MC/AArch64/invalid-instructions-spellcheck.s | 37 + test/MC/AMDGPU/gfx9_asm_all.s | 351 ++ test/MC/AMDGPU/vop3-errs.s | 38 +- test/MC/ARM/virtexts-thumb.s | 2 +- test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt | 405 +++ test/MC/Disassembler/Mips/mt/valid-r2-el.txt | 21 +- test/MC/Disassembler/Mips/mt/valid-r2.txt | 21 +- test/MC/Disassembler/SystemZ/insns-z14.txt | 3253 +++++++++++++++++++ test/MC/Mips/mt/invalid-wrong-error.s | 3 - test/MC/Mips/mt/invalid.s | 38 +- .../mt/mftr-mttr-aliases-invalid-wrong-error.s | 18 - test/MC/Mips/mt/mftr-mttr-aliases-invalid.s | 23 - test/MC/Mips/mt/mftr-mttr-aliases.s | 47 - test/MC/Mips/mt/mftr-mttr-reserved-valid.s | 8 - test/MC/Mips/mt/valid.s | 42 +- test/MC/SystemZ/insn-bad-z13.s | 705 +++++ test/MC/SystemZ/insn-bad-z14.s | 752 +++++ test/MC/SystemZ/insn-good-z14.s | 2674 ++++++++++++++++ test/MC/SystemZ/invalid-instructions-spellcheck.s | 66 + test/MC/X86/pr22028.s | 4 +- test/Object/no-section-table.test | 2 +- test/Object/readobj-shared-object.test | 12 +- test/ObjectYAML/CodeView/guid.yaml | 59 + test/Other/cgscc-libcall-update.ll | 61 + test/Other/new-pass-manager.ll | 2 + test/ThinLTO/X86/debuginfo-cu-import.ll | 4 +- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 1635 +++++++++- .../Transforms/CodeGenPrepare/X86/sink-addrmode.ll | 35 +- 
test/Transforms/EarlyCSE/globalsaa-memoryssa.ll | 25 + .../GVN/PRE/2017-06-28-pre-load-dbgloc.ll | 79 + test/Transforms/GVN/PRE/phi-translate.ll | 4 +- test/Transforms/GlobalOpt/pr33686.ll | 17 + test/Transforms/IRCE/eq_ne.ll | 257 ++ test/Transforms/IRCE/pre_post_loops.ll | 117 + test/Transforms/Inline/AArch64/ext.ll | 249 ++ test/Transforms/Inline/PowerPC/ext.ll | 140 + test/Transforms/Inline/PowerPC/lit.local.cfg | 3 + test/Transforms/Inline/X86/ext.ll | 201 ++ .../Transforms/InstCombine/2017-07-07-UMul-ZExt.ll | 24 +- test/Transforms/InstCombine/and-not-or.ll | 34 - test/Transforms/InstCombine/and.ll | 192 ++ test/Transforms/InstCombine/and2.ll | 85 +- .../InstCombine/element-atomic-memintrins.ll | 98 + test/Transforms/InstCombine/icmp-logical.ll | 165 +- test/Transforms/InstCombine/or-xor.ll | 28 +- test/Transforms/InstCombine/or.ll | 291 +- test/Transforms/InstCombine/pr33765.ll | 32 + test/Transforms/JumpThreading/select.ll | 77 +- .../LoopInterchange/current-limitations-lcssa.ll | 76 + .../LoopInterchange/interchange-flow-dep-outer.ll | 118 + .../LoopInterchange/interchange-not-profitable.ll | 66 + .../interchange-output-dependencies.ll | 86 + .../interchange-simple-count-down.ll | 69 + .../LoopInterchange/interchange-simple-count-up.ll | 86 + test/Transforms/LoopInterchange/interchange.ll | 749 ----- .../loop-interchange-optimization-remarks.ll | 220 ++ .../not-interchanged-dependencies-1.ll | 64 + .../not-interchanged-loop-nest-3.ll | 87 + .../not-interchanged-tightly-nested.ll | 143 + .../runtime-loop-multiexit-dom-verify.ll | 126 + .../LoopVectorize/X86/float-induction-x86.ll | 6 +- test/Transforms/LoopVectorize/debugloc.ll | 2 +- .../LoopVectorize/first-order-recurrence.ll | 8 +- test/Transforms/LoopVectorize/float-induction.ll | 14 +- .../Transforms/LoopVectorize/if-conversion-nest.ll | 25 +- test/Transforms/LoopVectorize/induction-step.ll | 4 +- test/Transforms/LoopVectorize/induction.ll | 4 +- .../interleaved-accesses-pred-stores.ll | 6 +- 
.../LoopVectorize/interleaved-accesses.ll | 10 +- test/Transforms/LoopVectorize/iv_outside_user.ll | 2 +- test/Transforms/LoopVectorize/miniters.ll | 4 +- .../LoopVectorize/pr30654-phiscev-sext-trunc.ll | 240 ++ .../LoopVectorize/runtime-check-readonly.ll | 1 - test/Transforms/LoopVectorize/runtime-check.ll | 2 +- test/tools/llvm-cov/showTabsHTML.cpp | 4 +- test/tools/llvm-dwarfdump/X86/verify_debug_info.s | 193 ++ .../llvm-dwarfdump/X86/verify_unit_header_chain.s | 81 + test/tools/llvm-mt/help.test | 7 + .../AArch64/Inputs/reloc-addend.obj.macho-aarch64 | Bin 0 -> 424 bytes .../llvm-objdump/AArch64/macho-reloc-addend.test | 6 + .../tools/llvm-readobj/Inputs/dynamic-table-so.x86 | Bin 8280 -> 8256 bytes test/tools/llvm-readobj/Inputs/dynamic-table.c | 4 +- test/tools/llvm-readobj/dynamic.test | 39 +- test/tools/llvm-readobj/gnu-sections.test | 10 +- tools/llvm-ar/CMakeLists.txt | 2 + tools/llvm-ar/llvm-ar.cpp | 6 +- tools/llvm-mt/CMakeLists.txt | 13 + tools/llvm-mt/LLVMBuild.txt | 22 + tools/llvm-mt/Opts.td | 29 + tools/llvm-mt/llvm-mt.cpp | 117 + tools/llvm-objdump/llvm-objdump.cpp | 5 +- tools/llvm-pdbutil/DumpOutputStyle.cpp | 4 +- tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 5 +- tools/llvm-pdbutil/MinimalTypeDumper.cpp | 20 +- tools/llvm-pdbutil/MinimalTypeDumper.h | 4 +- tools/llvm-pdbutil/PdbYaml.cpp | 35 - tools/llvm-pdbutil/PdbYaml.h | 2 +- tools/llvm-pdbutil/llvm-pdbutil.cpp | 4 +- tools/llvm-readobj/CMakeLists.txt | 2 + tools/llvm-readobj/COFFDumper.cpp | 3 +- tools/llvm-readobj/ELFDumper.cpp | 14 +- tools/llvm-readobj/llvm-readobj.cpp | 22 +- tools/opt-viewer/opt-diff.py | 27 +- tools/opt-viewer/opt-stats.py | 14 +- tools/opt-viewer/opt-viewer.py | 14 +- tools/opt-viewer/optpmap.py | 1 + tools/opt-viewer/optrecord.py | 20 +- unittests/Analysis/CGSCCPassManagerTest.cpp | 2 + unittests/Analysis/LazyCallGraphTest.cpp | 47 +- .../DebugInfo/CodeView/RandomAccessVisitorTest.cpp | 3 +- unittests/DebugInfo/PDB/CMakeLists.txt | 3 +- 
unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp | 183 -- unittests/IR/CFGBuilder.cpp | 269 ++ unittests/IR/CFGBuilder.h | 94 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/DominatorTreeTest.cpp | 300 +- unittests/IR/IRBuilderTest.cpp | 11 +- unittests/IR/MetadataTest.cpp | 24 +- unittests/Support/TargetParserTest.cpp | 5 +- unittests/Support/YAMLIOTest.cpp | 16 + unittests/Support/raw_ostream_test.cpp | 5 + utils/TableGen/CodeGenRegisters.cpp | 12 +- utils/lit/lit/LitConfig.py | 4 +- utils/lit/lit/TestRunner.py | 2 + utils/lit/lit/main.py | 12 +- utils/vim/syntax/llvm.vim | 4 +- 708 files changed, 43451 insertions(+), 10484 deletions(-) create mode 100644 include/llvm/DebugInfo/CodeView/GUID.h delete mode 100644 include/llvm/DebugInfo/CodeView/TypeServerHandler.h delete mode 100644 include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h create mode 100644 include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h delete mode 100644 lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp delete mode 100644 lib/Fuzzer/FuzzerTraceState.cpp create mode 100644 lib/Fuzzer/test/FlagsTest.cpp create mode 100644 lib/Target/AArch64/AArch64FalkorHWPFFix.cpp create mode 100644 lib/Target/SystemZ/SystemZScheduleZ14.td create mode 100644 lib/Target/X86/X86CmovConversion.cpp create mode 100644 lib/Target/X86/X86ScheduleZnver1.td create mode 100644 lib/ToolDrivers/llvm-dlltool/CMakeLists.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp create mode 100644 lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/Options.td create mode 100644 test/Bitcode/upgrade-importedentity.ll create mode 100644 test/Bitcode/upgrade-importedentity.ll.bc create mode 100644 test/CodeGen/AArch64/GlobalISel/select-fma.mir create mode 100644 test/CodeGen/AArch64/aarch64_win64cc_vararg.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.mir create mode 100644 
test/CodeGen/AArch64/falkor-hwpf.ll create mode 100644 test/CodeGen/AArch64/win64_vararg.ll create mode 100644 test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll create mode 100644 test/CodeGen/AMDGPU/move-to-valu-worklist.ll create mode 100644 test/CodeGen/BPF/select_ri.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-explicit-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-function-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-multiple-functions.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-text-section.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-load-v4i16.ll delete mode 100644 test/CodeGen/Hexagon/vect/vect-loadv4i16.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-v4i16.ll create mode 100644 test/CodeGen/MIR/AMDGPU/fold-multiple.mir create mode 100644 test/CodeGen/Mips/long-calls.ll create mode 100644 test/CodeGen/PowerPC/PR33671.ll create mode 100644 test/CodeGen/SPARC/soft-mul-div.ll create mode 100644 test/CodeGen/SystemZ/branch-11.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-03.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-04.ll create mode 100644 test/CodeGen/SystemZ/fp-add-04.ll create mode 100644 test/CodeGen/SystemZ/fp-cmp-06.ll create mode 100644 test/CodeGen/SystemZ/fp-const-11.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-15.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-16.ll create mode 100644 test/CodeGen/SystemZ/fp-copysign-02.ll create mode 100644 test/CodeGen/SystemZ/fp-div-04.ll create mode 100644 test/CodeGen/SystemZ/fp-move-13.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-10.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-11.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-12.ll create mode 100644 test/CodeGen/SystemZ/fp-neg-02.ll create mode 100644 test/CodeGen/SystemZ/fp-round-03.ll create mode 100644 test/CodeGen/SystemZ/fp-sqrt-04.ll create mode 100644 test/CodeGen/SystemZ/fp-sub-04.ll create mode 100644 test/CodeGen/SystemZ/int-add-17.ll create mode 
100644 test/CodeGen/SystemZ/int-mul-09.ll create mode 100644 test/CodeGen/SystemZ/int-mul-10.ll create mode 100644 test/CodeGen/SystemZ/int-mul-11.ll create mode 100644 test/CodeGen/SystemZ/int-sub-10.ll create mode 100644 test/CodeGen/SystemZ/tdc-07.ll create mode 100644 test/CodeGen/SystemZ/vec-abs-06.ll create mode 100644 test/CodeGen/SystemZ/vec-add-02.ll create mode 100644 test/CodeGen/SystemZ/vec-and-04.ll create mode 100644 test/CodeGen/SystemZ/vec-cmp-07.ll create mode 100644 test/CodeGen/SystemZ/vec-ctpop-02.ll create mode 100644 test/CodeGen/SystemZ/vec-div-02.ll create mode 100644 test/CodeGen/SystemZ/vec-intrinsics-01.ll create mode 100644 test/CodeGen/SystemZ/vec-intrinsics-02.ll delete mode 100644 test/CodeGen/SystemZ/vec-intrinsics.ll create mode 100644 test/CodeGen/SystemZ/vec-max-05.ll create mode 100644 test/CodeGen/SystemZ/vec-min-05.ll create mode 100644 test/CodeGen/SystemZ/vec-move-18.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-03.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-04.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-05.ll create mode 100644 test/CodeGen/SystemZ/vec-neg-02.ll create mode 100644 test/CodeGen/SystemZ/vec-or-03.ll create mode 100644 test/CodeGen/SystemZ/vec-round-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sqrt-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sub-02.ll create mode 100644 test/CodeGen/SystemZ/vec-xor-02.ll create mode 100644 test/CodeGen/Thumb/litpoolremat.ll create mode 100644 test/CodeGen/X86/alias-static-alloca.ll create mode 100644 test/CodeGen/X86/avx512-rotate.ll create mode 100644 test/CodeGen/X86/bmi-schedule.ll create mode 100644 test/CodeGen/X86/bmi2-schedule.ll create mode 100644 test/CodeGen/X86/bswap-rotate.ll create mode 100644 test/CodeGen/X86/f16c-schedule.ll create mode 100644 test/CodeGen/X86/lea32-schedule.ll create mode 100644 test/CodeGen/X86/lea64-schedule.ll create mode 100644 test/CodeGen/X86/lzcnt-schedule.ll create mode 100644 
test/CodeGen/X86/memcmp-minsize.ll create mode 100644 test/CodeGen/X86/memcmp-optsize.ll create mode 100644 test/CodeGen/X86/popcnt-schedule.ll create mode 100644 test/CodeGen/X86/pr32282.ll create mode 100644 test/CodeGen/X86/pr32515.ll create mode 100644 test/CodeGen/X86/pr33772.ll create mode 100644 test/CodeGen/X86/pr33828.ll create mode 100644 test/CodeGen/X86/rotate_vec.ll create mode 100644 test/CodeGen/X86/vector-rotate-512.ll create mode 100644 test/CodeGen/X86/x86-cmov-converter.ll create mode 100644 test/DllTool/coff-exports.def create mode 100644 test/DllTool/coff-weak-exports.def create mode 100644 test/DllTool/lit.local.cfg create mode 100644 test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll create mode 100644 test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll create mode 100644 test/MC/AArch64/coff-relocations.s create mode 100644 test/MC/AArch64/invalid-instructions-spellcheck.s create mode 100644 test/MC/Disassembler/SystemZ/insns-z14.txt delete mode 100644 test/MC/Mips/mt/invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-reserved-valid.s create mode 100644 test/MC/SystemZ/insn-bad-z14.s create mode 100644 test/MC/SystemZ/insn-good-z14.s create mode 100644 test/MC/SystemZ/invalid-instructions-spellcheck.s create mode 100644 test/ObjectYAML/CodeView/guid.yaml create mode 100644 test/Other/cgscc-libcall-update.ll create mode 100644 test/Transforms/EarlyCSE/globalsaa-memoryssa.ll create mode 100644 test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll create mode 100644 test/Transforms/GlobalOpt/pr33686.ll create mode 100644 
test/Transforms/IRCE/eq_ne.ll create mode 100644 test/Transforms/IRCE/pre_post_loops.ll create mode 100644 test/Transforms/Inline/AArch64/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/lit.local.cfg create mode 100644 test/Transforms/Inline/X86/ext.ll delete mode 100644 test/Transforms/InstCombine/and-not-or.ll create mode 100644 test/Transforms/InstCombine/element-atomic-memintrins.ll create mode 100644 test/Transforms/InstCombine/pr33765.ll create mode 100644 test/Transforms/LoopInterchange/current-limitations-lcssa.ll create mode 100644 test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll create mode 100644 test/Transforms/LoopInterchange/interchange-not-profitable.ll create mode 100644 test/Transforms/LoopInterchange/interchange-output-dependencies.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-down.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-up.ll delete mode 100644 test/Transforms/LoopInterchange/interchange.ll create mode 100644 test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll create mode 100644 test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll create mode 100644 test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll create mode 100644 test/tools/llvm-dwarfdump/X86/verify_debug_info.s create mode 100644 test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s create mode 100644 test/tools/llvm-mt/help.test create mode 100644 test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 create mode 100644 test/tools/llvm-objdump/AArch64/macho-reloc-addend.test create mode 100644 tools/llvm-mt/CMakeLists.txt create mode 
100644 tools/llvm-mt/LLVMBuild.txt create mode 100644 tools/llvm-mt/Opts.td create mode 100644 tools/llvm-mt/llvm-mt.cpp delete mode 100644 unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp create mode 100644 unittests/IR/CFGBuilder.cpp create mode 100644 unittests/IR/CFGBuilder.h diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT index 7bfa88c6cf0e..9a01c725fb51 100644 --- a/RELEASE_TESTERS.TXT +++ b/RELEASE_TESTERS.TXT @@ -41,14 +41,9 @@ E: hans@chromium.org T: x86 O: Windows -N: Renato Golin -E: renato.golin@linaro.org -T: ARM -O: Linux - N: Diana Picus E: diana.picus@linaro.org -T: AArch64 +T: ARM, AArch64 O: Linux N: Simon Dardis diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index e201333f3007..0a5cb00a48d3 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -132,7 +132,8 @@ The ``MayAlias`` response is used whenever the two pointers might refer to the same object. The ``PartialAlias`` response is used when the two memory objects are known to -be overlapping in some way, but do not start at the same address. +be overlapping in some way, regardless whether they start at the same address +or not. The ``MustAlias`` response may only be returned if the two memory objects are guaranteed to always start at exactly the same location. A ``MustAlias`` diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 722718bf4f16..fa41198755fd 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -34,10 +34,10 @@ There are some conventions that are not uniformly followed in the code base (e.g. the naming convention). This is because they are relatively new, and a lot of code was written before they were put in place. Our long term goal is for the entire codebase to follow the convention, but we explicitly *do not* -want patches that do large-scale reformating of existing code. On the other +want patches that do large-scale reformatting of existing code. 
On the other hand, it is reasonable to rename the methods of a class if you're about to -change it in some other way. Just do the reformating as a separate commit from -the functionality change. +change it in some other way. Just do the reformatting as a separate commit +from the functionality change. The ultimate goal of these guidelines is to increase the readability and maintainability of our common source base. If you have suggestions for topics to diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst index b4d15ef57b73..fbe1a9ab1843 100644 --- a/docs/CommandGuide/lit.rst +++ b/docs/CommandGuide/lit.rst @@ -80,6 +80,13 @@ OUTPUT OPTIONS Show more information on test failures, for example the entire test output instead of just the test result. +.. option:: -vv, --echo-all-commands + + Echo all commands to stdout, as they are being executed. + This can be valuable for debugging test failures, as the last echoed command + will be the one which has failed. + This option implies ``--verbose``. + .. option:: -a, --show-all Show more information about all tests, for example the entire test diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h index 8cae63c3c869..b566aeaf1fd6 100644 --- a/include/llvm/Analysis/DominanceFrontier.h +++ b/include/llvm/Analysis/DominanceFrontier.h @@ -29,9 +29,9 @@ namespace llvm { /// DominanceFrontierBase - Common base class for computing forward and inverse /// dominance frontiers for a function. 
/// -template +template class DominanceFrontierBase { -public: + public: typedef std::set DomSetType; // Dom set for a bb typedef std::map DomSetMapType; // Dom set map @@ -40,10 +40,10 @@ protected: DomSetMapType Frontiers; std::vector Roots; - const bool IsPostDominators; + static constexpr bool IsPostDominators = IsPostDom; -public: - DominanceFrontierBase(bool isPostDom) : IsPostDominators(isPostDom) {} + public: + DominanceFrontierBase() {} /// getRoots - Return the root blocks of the current CFG. This may include /// multiple blocks if we are computing post dominators. For forward @@ -96,7 +96,7 @@ public: /// compare - Return true if the other dominance frontier base matches /// this dominance frontier base. Otherwise return false. - bool compare(DominanceFrontierBase &Other) const; + bool compare(DominanceFrontierBase &Other) const; /// print - Convert to human readable form /// @@ -113,22 +113,21 @@ public: /// used to compute a forward dominator frontiers. /// template -class ForwardDominanceFrontierBase : public DominanceFrontierBase { -private: +class ForwardDominanceFrontierBase + : public DominanceFrontierBase { + private: typedef GraphTraits BlockTraits; public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef typename DominanceFrontierBase::DomSetType DomSetType; - - ForwardDominanceFrontierBase() : DominanceFrontierBase(false) {} - - void analyze(DomTreeT &DT) { - this->Roots = DT.getRoots(); - assert(this->Roots.size() == 1 && - "Only one entry block for forward domfronts!"); - calculate(DT, DT[this->Roots[0]]); + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef typename DominanceFrontierBase::DomSetType DomSetType; + + void analyze(DomTreeT &DT) { + this->Roots = DT.getRoots(); + assert(this->Roots.size() == 1 && + "Only one entry block for forward domfronts!"); + calculate(DT, DT[this->Roots[0]]); } const DomSetType &calculate(const DomTreeT &DT, const DomTreeNodeT *Node); @@ 
-136,15 +135,16 @@ public: class DominanceFrontier : public ForwardDominanceFrontierBase { public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef DominanceFrontierBase::DomSetType DomSetType; - typedef DominanceFrontierBase::iterator iterator; - typedef DominanceFrontierBase::const_iterator const_iterator; - - /// Handle invalidation explicitly. - bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &); + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef DominanceFrontierBase::DomSetType DomSetType; + typedef DominanceFrontierBase::iterator iterator; + typedef DominanceFrontierBase::const_iterator + const_iterator; + + /// Handle invalidation explicitly. + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); }; class DominanceFrontierWrapperPass : public FunctionPass { @@ -168,7 +168,8 @@ public: void dump() const; }; -extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; extern template class ForwardDominanceFrontierBase; /// \brief Analysis pass which computes a \c DominanceFrontier. 
diff --git a/include/llvm/Analysis/DominanceFrontierImpl.h b/include/llvm/Analysis/DominanceFrontierImpl.h index 9f8cacc24f2c..5093b975e709 100644 --- a/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/include/llvm/Analysis/DominanceFrontierImpl.h @@ -39,33 +39,33 @@ public: const DomTreeNodeT *parentNode; }; -template -void DominanceFrontierBase::removeBlock(BlockT *BB) { +template +void DominanceFrontierBase::removeBlock(BlockT *BB) { assert(find(BB) != end() && "Block is not in DominanceFrontier!"); for (iterator I = begin(), E = end(); I != E; ++I) I->second.erase(BB); Frontiers.erase(BB); } -template -void DominanceFrontierBase::addToFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::addToFrontier(iterator I, + BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -void DominanceFrontierBase::removeFromFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::removeFromFrontier( + iterator I, BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, - const DomSetType &DS2) const { +template +bool DominanceFrontierBase::compareDomSet( + DomSetType &DS1, const DomSetType &DS2) const { std::set tmpSet; for (BlockT *BB : DS2) tmpSet.insert(BB); @@ -88,9 +88,9 @@ bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, return false; } -template -bool DominanceFrontierBase::compare( - DominanceFrontierBase &Other) const { +template +bool DominanceFrontierBase::compare( + DominanceFrontierBase &Other) const { DomSetMapType tmpFrontiers; for (typename DomSetMapType::const_iterator I = Other.begin(), E = Other.end(); @@ -118,8 +118,8 @@ bool DominanceFrontierBase::compare( return false; } -template -void 
DominanceFrontierBase::print(raw_ostream &OS) const { +template +void DominanceFrontierBase::print(raw_ostream &OS) const { for (const_iterator I = begin(), E = end(); I != E; ++I) { OS << " DomFrontier for BB "; if (I->first) @@ -142,8 +142,8 @@ void DominanceFrontierBase::print(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -template -void DominanceFrontierBase::dump() const { +template +void DominanceFrontierBase::dump() const { print(dbgs()); } #endif diff --git a/include/llvm/Analysis/IteratedDominanceFrontier.h b/include/llvm/Analysis/IteratedDominanceFrontier.h index bd74d6bd14c3..edaf4e9025bc 100644 --- a/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -42,11 +42,11 @@ namespace llvm { /// By default, liveness is not used to prune the IDF computation. /// The template parameters should be either BasicBlock* or Inverse, depending on if you want the forward or reverse IDF. -template +template class IDFCalculator { - -public: - IDFCalculator(DominatorTreeBase &DT) : DT(DT), useLiveIn(false) {} + public: + IDFCalculator(DominatorTreeBase &DT) + : DT(DT), useLiveIn(false) {} /// \brief Give the IDF calculator the set of blocks in which the value is /// defined. 
This is equivalent to the set of starting blocks it should be @@ -84,12 +84,12 @@ public: void calculate(SmallVectorImpl &IDFBlocks); private: - DominatorTreeBase &DT; - bool useLiveIn; - const SmallPtrSetImpl *LiveInBlocks; - const SmallPtrSetImpl *DefBlocks; + DominatorTreeBase &DT; + bool useLiveIn; + const SmallPtrSetImpl *LiveInBlocks; + const SmallPtrSetImpl *DefBlocks; }; -typedef IDFCalculator ForwardIDFCalculator; -typedef IDFCalculator> ReverseIDFCalculator; +typedef IDFCalculator ForwardIDFCalculator; +typedef IDFCalculator, true> ReverseIDFCalculator; } #endif diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 3a052761ad7d..a025f2275fb4 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -43,6 +43,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -908,7 +909,7 @@ public: /// This sets up the graph and computes all of the entry points of the graph. /// No function definitions are scanned until their nodes in the graph are /// requested during traversal. - LazyCallGraph(Module &M); + LazyCallGraph(Module &M, TargetLibraryInfo &TLI); LazyCallGraph(LazyCallGraph &&G); LazyCallGraph &operator=(LazyCallGraph &&RHS); @@ -966,6 +967,22 @@ public: return insertInto(F, N); } + /// Get the sequence of known and defined library functions. + /// + /// These functions, because they are known to LLVM, can have calls + /// introduced out of thin air from arbitrary IR. + ArrayRef getLibFunctions() const { + return LibFunctions.getArrayRef(); + } + + /// Test whether a function is a known and defined library function tracked by + /// the call graph. 
+ /// + /// Because these functions are known to LLVM they are specially modeled in + /// the call graph and even when all IR-level references have been removed + /// remain active and reachable. + bool isLibFunction(Function &F) const { return LibFunctions.count(&F); } + ///@{ /// \name Pre-SCC Mutation API /// @@ -1100,6 +1117,11 @@ private: /// These are all of the RefSCCs which have no children. SmallVector LeafRefSCCs; + /// Defined functions that are also known library functions which the + /// optimizer can reason about and therefore might introduce calls to out of + /// thin air. + SmallSetVector LibFunctions; + /// Helper to insert a new function, with an already looked-up entry in /// the NodeMap. Node &insertInto(Function &F, Node *&MappedN); @@ -1216,8 +1238,8 @@ public: /// /// This just builds the set of entry points to the call graph. The rest is /// built lazily as it is walked. - LazyCallGraph run(Module &M, ModuleAnalysisManager &) { - return LazyCallGraph(M); + LazyCallGraph run(Module &M, ModuleAnalysisManager &AM) { + return LazyCallGraph(M, AM.getResult(M)); } }; diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 096df1e421a7..70ce9a870517 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -56,7 +56,8 @@ class Loop; class MDNode; class PHINode; class raw_ostream; -template class DominatorTreeBase; +template +class DominatorTreeBase; template class LoopInfoBase; template class LoopBase; @@ -663,12 +664,12 @@ public: } /// Create the loop forest using a stable algorithm. 
- void analyze(const DominatorTreeBase &DomTree); + void analyze(const DominatorTreeBase &DomTree); // Debugging void print(raw_ostream &OS) const; - void verify(const DominatorTreeBase &DomTree) const; + void verify(const DominatorTreeBase &DomTree) const; }; // Implementation in LoopInfoImpl.h @@ -683,7 +684,7 @@ class LoopInfo : public LoopInfoBase { LoopInfo(const LoopInfo &) = delete; public: LoopInfo() {} - explicit LoopInfo(const DominatorTreeBase &DomTree); + explicit LoopInfo(const DominatorTreeBase &DomTree); LoopInfo(LoopInfo &&Arg) : BaseT(std::move(static_cast(Arg))) {} LoopInfo &operator=(LoopInfo &&RHS) { diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 372fc8b21745..e9177e68ed77 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -340,10 +340,10 @@ void LoopBase::print(raw_ostream &OS, unsigned Depth, /// Discover a subloop with the specified backedges such that: All blocks within /// this loop are mapped to this loop or a subloop. And all subloops within this /// loop have their parent loop set to this loop or a subloop. -template -static void discoverAndMapSubloop(LoopT *L, ArrayRef Backedges, - LoopInfoBase *LI, - const DominatorTreeBase &DomTree) { +template +static void discoverAndMapSubloop( + LoopT *L, ArrayRef Backedges, LoopInfoBase *LI, + const DomTreeBase &DomTree) { typedef GraphTraits > InvBlockTraits; unsigned NumBlocks = 0; @@ -462,10 +462,9 @@ void PopulateLoopsDFS::insertIntoLoop(BlockT *Block) { /// /// The Block vectors are inclusive, so step 3 requires loop-depth number of /// insertions per block. -template -void LoopInfoBase:: -analyze(const DominatorTreeBase &DomTree) { - +template +void LoopInfoBase::analyze( + const DomTreeBase &DomTree) { // Postorder traversal of the dominator tree. 
const DomTreeNodeBase *DomRoot = DomTree.getRootNode(); for (auto DomNode : post_order(DomRoot)) { @@ -607,7 +606,7 @@ static void compareLoops(const LoopT *L, const LoopT *OtherL, template void LoopInfoBase::verify( - const DominatorTreeBase &DomTree) const { + const DomTreeBase &DomTree) const { DenseSet Loops; for (iterator I = begin(), E = end(); I != E; ++I) { assert(!(*I)->getParentLoop() && "Top-level loop has a parent!"); diff --git a/include/llvm/Analysis/PostDominators.h b/include/llvm/Analysis/PostDominators.h index 94ee3b03bb86..17f2e8eaf4a2 100644 --- a/include/llvm/Analysis/PostDominators.h +++ b/include/llvm/Analysis/PostDominators.h @@ -22,10 +22,8 @@ namespace llvm { /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used to /// compute the post-dominator tree. /// -struct PostDominatorTree : public DominatorTreeBase { - typedef DominatorTreeBase Base; - - PostDominatorTree() : DominatorTreeBase(true) {} +struct PostDominatorTree : public PostDomTreeBase { + typedef PostDomTreeBase Base; /// Handle invalidation explicitly. bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index c7accfae78b0..d1b182755cf8 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -237,17 +237,15 @@ struct FoldingSetTrait : DefaultFoldingSetTrait { }; /// This class represents an assumption that two SCEV expressions are equal, -/// and this can be checked at run-time. We assume that the left hand side is -/// a SCEVUnknown and the right hand side a constant. +/// and this can be checked at run-time. class SCEVEqualPredicate final : public SCEVPredicate { - /// We assume that LHS == RHS, where LHS is a SCEVUnknown and RHS a - /// constant. - const SCEVUnknown *LHS; - const SCEVConstant *RHS; + /// We assume that LHS == RHS. 
+ const SCEV *LHS; + const SCEV *RHS; public: - SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, - const SCEVConstant *RHS); + SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEV *LHS, + const SCEV *RHS); /// Implementation of the SCEVPredicate interface bool implies(const SCEVPredicate *N) const override; @@ -256,10 +254,10 @@ public: const SCEV *getExpr() const override; /// Returns the left hand side of the equality. - const SCEVUnknown *getLHS() const { return LHS; } + const SCEV *getLHS() const { return LHS; } /// Returns the right hand side of the equality. - const SCEVConstant *getRHS() const { return RHS; } + const SCEV *getRHS() const { return RHS; } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const SCEVPredicate *P) { @@ -1241,6 +1239,14 @@ public: SmallVector NewOp(Operands.begin(), Operands.end()); return getAddRecExpr(NewOp, L, Flags); } + + /// Checks if \p SymbolicPHI can be rewritten as an AddRecExpr under some + /// Predicates. If successful return these ; + /// The function is intended to be called from PSCEV (the caller will decide + /// whether to actually add the predicates and carry out the rewrites). + Optional>> + createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI); + /// Returns an expression for a GEP /// /// \p GEP The GEP. The indices contained in the GEP itself are ignored, @@ -1675,8 +1681,7 @@ public: return F.getParent()->getDataLayout(); } - const SCEVPredicate *getEqualPredicate(const SCEVUnknown *LHS, - const SCEVConstant *RHS); + const SCEVPredicate *getEqualPredicate(const SCEV *LHS, const SCEV *RHS); const SCEVPredicate * getWrapPredicate(const SCEVAddRecExpr *AR, @@ -1692,6 +1697,19 @@ public: SmallPtrSetImpl &Preds); private: + /// Similar to createAddRecFromPHI, but with the additional flexibility of + /// suggesting runtime overflow checks in case casts are encountered. 
+ /// If successful, the analysis records that for this loop, \p SymbolicPHI, + /// which is the UnknownSCEV currently representing the PHI, can be rewritten + /// into an AddRec, assuming some predicates; The function then returns the + /// AddRec and the predicates as a pair, and caches this pair in + /// PredicatedSCEVRewrites. + /// If the analysis is not successful, a mapping from the \p SymbolicPHI to + /// itself (with no predicates) is recorded, and a nullptr with an empty + /// predicates vector is returned as a pair. + Optional>> + createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI); + /// Compute the backedge taken count knowing the interval difference, the /// stride and presence of the equality in the comparison. const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride, @@ -1722,6 +1740,12 @@ private: FoldingSet UniquePreds; BumpPtrAllocator SCEVAllocator; + /// Cache tentative mappings from UnknownSCEVs in a Loop, to a SCEV expression + /// they can be rewritten into under certain predicates. + DenseMap, + std::pair>> + PredicatedSCEVRewrites; + /// The head of a linked list of all SCEVUnknown values that have been /// allocated. This is used by releaseMemory to locate them all and call /// their destructors. diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index dfb525e3de7a..24edd3826a2e 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -155,6 +155,13 @@ public: int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) const; + /// \brief Estimate the cost of a EXT operation when lowered. + /// + /// The contract for this function is the same as \c getOperationCost except + /// that it supports an interface that provides extra information specific to + /// the EXT operation. + int getExtCost(const Instruction *I, const Value *Src) const; + /// \brief Estimate the cost of a function call when lowered. 
/// /// The contract for this is the same as \c getOperationCost except that it @@ -849,6 +856,7 @@ public: virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0; virtual int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) = 0; + virtual int getExtCost(const Instruction *I, const Value *Src) = 0; virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0; virtual int getCallCost(const Function *F, int NumArgs) = 0; virtual int getCallCost(const Function *F, @@ -1022,6 +1030,9 @@ public: ArrayRef Operands) override { return Impl.getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) override { + return Impl.getExtCost(I, Src); + } int getCallCost(FunctionType *FTy, int NumArgs) override { return Impl.getCallCost(FTy, NumArgs); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 8740ee92eed5..0b07fe9aa232 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -120,6 +120,10 @@ public: return SI.getNumCases(); } + int getExtCost(const Instruction *I, const Value *Src) { + return TTI::TCC_Basic; + } + unsigned getCallCost(FunctionType *FTy, int NumArgs) { assert(FTy && "FunctionType must be provided to this routine."); @@ -728,6 +732,8 @@ public: // nop on most sane targets. 
if (isa(CI->getOperand(0))) return TTI::TCC_Free; + if (isa(CI) || isa(CI) || isa(CI)) + return static_cast(this)->getExtCost(CI, Operands.back()); } return static_cast(this)->getOperationCost( diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index b59fd60e8aed..633107024792 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -155,6 +155,18 @@ public: return BaseT::getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) { + if (getTLI()->isExtFree(I)) + return TargetTransformInfo::TCC_Free; + + if (isa(I) || isa(I)) + if (const LoadInst *LI = dyn_cast(Src)) + if (getTLI()->isExtLoad(LI, I, DL)) + return TargetTransformInfo::TCC_Free; + + return TargetTransformInfo::TCC_Basic; + } + unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Arguments) { return BaseT::getIntrinsicCost(IID, RetTy, Arguments); diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h index 370ffbe4862e..6efeefd9a721 100644 --- a/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -23,27 +23,24 @@ class MachineDominanceFrontier : public MachineFunctionPass { ForwardDominanceFrontierBase Base; public: - using DomTreeT = DominatorTreeBase; - using DomTreeNodeT = DomTreeNodeBase; - using DomSetType = DominanceFrontierBase::DomSetType; - using iterator = DominanceFrontierBase::iterator; - using const_iterator = - DominanceFrontierBase::const_iterator; + using DomTreeT = DomTreeBase; + using DomTreeNodeT = DomTreeNodeBase; + using DomSetType = DominanceFrontierBase::DomSetType; + using iterator = DominanceFrontierBase::iterator; + using const_iterator = + DominanceFrontierBase::const_iterator; - MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; - MachineDominanceFrontier & - operator=(const MachineDominanceFrontier &) = delete; + 
MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier &operator=(const MachineDominanceFrontier &) = delete; - static char ID; + static char ID; - MachineDominanceFrontier(); + MachineDominanceFrontier(); - DominanceFrontierBase &getBase() { - return Base; - } + DominanceFrontierBase &getBase() { return Base; } - inline const std::vector &getRoots() const { - return Base.getRoots(); + inline const std::vector &getRoots() const { + return Base.getRoots(); } MachineBasicBlock *getRoot() const { @@ -98,7 +95,7 @@ public: return Base.compareDomSet(DS1, DS2); } - bool compare(DominanceFrontierBase &Other) const { + bool compare(DominanceFrontierBase &Other) const { return Base.compare(Other); } diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 74a7c3ea04ae..8bf98f606495 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -28,13 +28,15 @@ namespace llvm { -template<> -inline void DominatorTreeBase::addRoot(MachineBasicBlock* MBB) { +template <> +inline void DominatorTreeBase::addRoot( + MachineBasicBlock *MBB) { this->Roots.push_back(MBB); } extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree using MachineDomTreeNode = DomTreeNodeBase; @@ -65,7 +67,7 @@ class MachineDominatorTree : public MachineFunctionPass { mutable SmallSet NewBBs; /// The DominatorTreeBase that is used to compute a normal dominator tree - std::unique_ptr> DT; + std::unique_ptr> DT; /// \brief Apply all the recorded critical edges to the DT. 
/// This updates the underlying DT information in a way that uses @@ -79,9 +81,8 @@ public: MachineDominatorTree(); - DominatorTreeBase &getBase() { - if (!DT) - DT.reset(new DominatorTreeBase(false)); + DomTreeBase &getBase() { + if (!DT) DT.reset(new DomTreeBase()); applySplitCriticalEdges(); return *DT; } diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h index 70bdb191ad34..d29d2d85cb0a 100644 --- a/include/llvm/CodeGen/MachinePostDominators.h +++ b/include/llvm/CodeGen/MachinePostDominators.h @@ -26,7 +26,7 @@ namespace llvm { /// struct MachinePostDominatorTree : public MachineFunctionPass { private: - DominatorTreeBase *DT; + PostDomTreeBase *DT; public: static char ID; diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 70ccc867cd38..df55e181364c 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -17,7 +17,6 @@ namespace llvm { namespace codeview { class TypeCollection; -class TypeServerHandler; class TypeVisitorCallbacks; enum VisitorDataSource { @@ -31,11 +30,9 @@ enum VisitorDataSource { Error visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitMemberRecord(CVMemberRecord Record, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source = VDS_BytesPresent); @@ -46,12 +43,9 @@ Error visitMemberRecordStream(ArrayRef FieldList, TypeVisitorCallbacks &Callbacks); Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler 
*TS = nullptr); -Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); -Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); +Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks); +Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index db944c7057f7..94f104ff772c 100644 --- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -84,7 +84,7 @@ public: Error mapEncodedInteger(uint64_t &Value); Error mapEncodedInteger(APSInt &Value); Error mapStringZ(StringRef &Value); - Error mapGuid(StringRef &Guid); + Error mapGuid(GUID &Guid); Error mapStringZVectorZ(std::vector &Value); diff --git a/include/llvm/DebugInfo/CodeView/Formatters.h b/include/llvm/DebugInfo/CodeView/Formatters.h index 0842c1e373db..278ad02a39cd 100644 --- a/include/llvm/DebugInfo/CodeView/Formatters.h +++ b/include/llvm/DebugInfo/CodeView/Formatters.h @@ -12,6 +12,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" @@ -31,7 +32,7 @@ public: explicit GuidAdapter(ArrayRef Guid); explicit GuidAdapter(StringRef Guid); - void format(raw_ostream &Stream, StringRef Style) override ; + void format(raw_ostream &Stream, StringRef Style) override; }; } // end namespace detail @@ -60,6 +61,13 @@ public: } }; +template <> struct format_provider { + static void format(const codeview::GUID &V, llvm::raw_ostream &Stream, + StringRef Style) { + Stream << V; + } +}; + } // end namespace llvm #endif // 
LLVM_DEBUGINFO_CODEVIEW_FORMATTERS_H diff --git a/include/llvm/DebugInfo/CodeView/GUID.h b/include/llvm/DebugInfo/CodeView/GUID.h new file mode 100644 index 000000000000..a055ce9e2e45 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/GUID.h @@ -0,0 +1,55 @@ +//===- GUID.h ---------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_GUID_H +#define LLVM_DEBUGINFO_CODEVIEW_GUID_H + +#include +#include + +namespace llvm { +class raw_ostream; + +namespace codeview { + +/// This represents the 'GUID' type from windows.h. +struct GUID { + uint8_t Guid[16]; +}; + +inline bool operator==(const GUID &LHS, const GUID &RHS) { + return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); +} + +inline bool operator<(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) < 0; +} + +inline bool operator<=(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) <= 0; +} + +inline bool operator>(const GUID &LHS, const GUID &RHS) { + return !(LHS <= RHS); +} + +inline bool operator>=(const GUID &LHS, const GUID &RHS) { + return !(LHS < RHS); +} + +inline bool operator!=(const GUID &LHS, const GUID &RHS) { + return !(LHS == RHS); +} + +raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid); + +} // namespace codeview +} // namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index cdfc1745cea5..f3086cf3dbb9 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -848,7 +848,7 @@ public: : SymbolRecord(SymbolRecordKind::BuildInfoSym), RecordOffset(RecordOffset) {} - uint32_t BuildId; + 
TypeIndex BuildId; uint32_t RecordOffset; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 2efeb1b3cefd..7942c0c0bc21 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -18,6 +18,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Endian.h" @@ -539,15 +540,17 @@ class TypeServer2Record : public TypeRecord { public: TypeServer2Record() = default; explicit TypeServer2Record(TypeRecordKind Kind) : TypeRecord(Kind) {} - TypeServer2Record(StringRef Guid, uint32_t Age, StringRef Name) - : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age), - Name(Name) {} + TypeServer2Record(StringRef GuidStr, uint32_t Age, StringRef Name) + : TypeRecord(TypeRecordKind::TypeServer2), Age(Age), Name(Name) { + assert(GuidStr.size() == 16 && "guid isn't 16 bytes"); + ::memcpy(Guid.Guid, GuidStr.data(), 16); + } - StringRef getGuid() const { return Guid; } + const GUID &getGuid() const { return Guid; } uint32_t getAge() const { return Age; } StringRef getName() const { return Name; } - StringRef Guid; + GUID Guid; uint32_t Age; StringRef Name; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h b/include/llvm/DebugInfo/CodeView/TypeServerHandler.h deleted file mode 100644 index e96baad9ceae..000000000000 --- a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h +++ /dev/null @@ -1,38 +0,0 @@ -//===- TypeServerHandler.h --------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H -#define LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H - -#include "llvm/Support/Error.h" - -namespace llvm { -namespace codeview { - -class TypeServer2Record; -class TypeVisitorCallbacks; - -class TypeServerHandler { -public: - virtual ~TypeServerHandler() = default; - - /// Handle a TypeServer record. If the implementation returns true - /// the record will not be processed by the top-level visitor. If - /// it returns false, it will be processed. If it returns an Error, - /// then the top-level visitor will fail. - virtual Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - return false; - } -}; - -} // end namespace codeview -} // end namespace llvm - -#endif // LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 3ad2b4e9c92f..d78fab47db66 100644 --- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -19,7 +19,6 @@ namespace llvm { namespace codeview { class TypeIndex; -class TypeServerHandler; class TypeTableBuilder; /// \brief Merge one set of type records into another. This method assumes @@ -31,16 +30,13 @@ class TypeTableBuilder; /// type stream, that contains the index of the corresponding type record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param Types The collection of types to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an /// appropriate error code. Error mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, const CVTypeArray &Types); + const CVTypeArray &Types); /// \brief Merge one set of id records into another. 
This method assumes /// that all records are id records, and there are no Type records present. @@ -65,7 +61,7 @@ Error mergeTypeRecords(TypeTableBuilder &Dest, /// appropriate error code. Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, SmallVectorImpl &SourceToDest, - const CVTypeArray &Ids); + const CVTypeArray &Ids); /// \brief Merge a unified set of type and id records, splitting them into /// separate output streams. @@ -78,9 +74,6 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, /// id stream, that contains the index of the corresponding id record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param IdsAndTypes The collection of id records to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an @@ -88,8 +81,7 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, Error mergeTypeAndIdRecords(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index ea36ab7ab5b6..056c1b77c65d 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -238,6 +238,34 @@ public: uint8_t getUnitType() const { return UnitType; } + static bool isValidUnitType(uint8_t UnitType) { + return UnitType == dwarf::DW_UT_compile || UnitType == dwarf::DW_UT_type || + UnitType == dwarf::DW_UT_partial || + UnitType == dwarf::DW_UT_skeleton || + UnitType == dwarf::DW_UT_split_compile || + UnitType == dwarf::DW_UT_split_type; + } + + /// \brief Return the number of bytes for the header of a unit of + /// UnitType type. 
+ /// + /// This function must be called with a valid unit type which in + /// DWARF5 is defined as one of the following six types. + static uint32_t getDWARF5HeaderSize(uint8_t UnitType) { + switch (UnitType) { + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + return 12; + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + return 20; + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: + return 24; + } + llvm_unreachable("Invalid UnitType."); + } + uint64_t getBaseAddress() const { return BaseAddr; } void setBaseAddress(uint64_t base_addr) { diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 9eb5c45faba8..c0291a83ed97 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -21,6 +21,7 @@ class DWARFContext; class DWARFDie; class DWARFUnit; class DWARFAcceleratorTable; +class DWARFDataExtractor; /// A class that verifies DWARF debug information given a DWARF Context. class DWARFVerifier { @@ -30,10 +31,35 @@ class DWARFVerifier { /// can verify each reference points to a valid DIE and not an offset that /// lies between to valid DIEs. std::map> ReferenceToDIEOffsets; - uint32_t NumDebugInfoErrors = 0; uint32_t NumDebugLineErrors = 0; uint32_t NumAppleNamesErrors = 0; + /// Verifies the header of a unit in the .debug_info section. + /// + /// This function currently checks for: + /// - Unit is in 32-bit DWARF format. The function can be modified to + /// support 64-bit format. + /// - The DWARF version is valid + /// - The unit type is valid (if unit is in version >=5) + /// - The unit doesn't extend beyond .debug_info section + /// - The address size is valid + /// - The offset in the .debug_abbrev section is valid + /// + /// \param DebugInfoData The .debug_info section data + /// \param Offset A reference to the offset start of the unit. 
The offset will + /// be updated to point to the next unit in .debug_info + /// \param UnitIndex The index of the unit to be verified + /// \param UnitType A reference to the type of the unit + /// \param isUnitDWARF64 A reference to a flag that shows whether the unit is + /// in 64-bit format. + /// + /// \returns true if the header is verified successfully, false otherwise. + bool verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, uint8_t &UnitType, + bool &isUnitDWARF64); + + + bool verifyUnitContents(DWARFUnit Unit); /// Verifies the attribute's DWARF attribute and its value. /// /// This function currently checks for: @@ -42,7 +68,11 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occured during verification of + /// attributes' values in a .debug_info section unit + unsigned verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue); /// Verifies the attribute's DWARF form. /// @@ -53,7 +83,10 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occured during verification of + /// attributes' forms in a .debug_info section unit + unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); /// Verifies the all valid references that were found when iterating through /// all of the DIE attributes. @@ -62,7 +95,10 @@ class DWARFVerifier { /// offset matches. This helps to ensure if a DWARF link phase moved things /// around, that it doesn't create invalid references by failing to relocate /// CU relative and absolute references. 
- void verifyDebugInfoReferences(); + /// + /// \returns NumErrors The number of errors occured during verification of + /// references for the .debug_info section + unsigned verifyDebugInfoReferences(); /// Verify the the DW_AT_stmt_list encoding and value and ensure that no /// compile units that have the same DW_AT_stmt_list value. diff --git a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h index 3710eb29e7f9..d37b48540ffa 100644 --- a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h +++ b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h @@ -106,7 +106,7 @@ public: getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index 466cb455651b..03205a986f1a 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -19,6 +19,7 @@ namespace pdb { enum class generic_error_code { invalid_path = 1, dia_sdk_not_present, + type_server_not_found, unspecified, }; diff --git a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h index fab086c62c72..eefc36518728 100644 --- a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h @@ -118,7 +118,7 @@ public: virtual uint32_t getVirtualTableShapeId() const = 0; virtual PDB_DataKind getDataKind() const = 0; virtual PDB_SymType getSymTag() const = 0; - virtual PDB_UniqueId getGuid() const = 0; + virtual codeview::GUID getGuid() const = 0; virtual int32_t getOffset() const = 0; virtual int32_t getThisAdjust() const = 0; virtual int32_t getVirtualBasePointerOffset() const = 0; diff --git 
a/include/llvm/DebugInfo/PDB/Native/Formatters.h b/include/llvm/DebugInfo/PDB/Native/Formatters.h index 183f0ad8307e..7d5eab2e2a09 100644 --- a/include/llvm/DebugInfo/PDB/Native/Formatters.h +++ b/include/llvm/DebugInfo/PDB/Native/Formatters.h @@ -23,13 +23,6 @@ break; namespace llvm { -template <> struct format_provider { - static void format(const pdb::PDB_UniqueId &V, llvm::raw_ostream &Stream, - StringRef Style) { - codeview::fmt_guid(V.Guid).format(Stream, Style); - } -}; - template <> struct format_provider { static void format(const pdb::PdbRaw_ImplVer &V, llvm::raw_ostream &Stream, StringRef Style) { diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/include/llvm/DebugInfo/PDB/Native/InfoStream.h index 37bf5f3b573c..fb8271cb5ebc 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStream.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStream.h @@ -12,6 +12,7 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringMap.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" @@ -39,7 +40,7 @@ public: PdbRaw_ImplVer getVersion() const; uint32_t getSignature() const; uint32_t getAge() const; - PDB_UniqueId getGuid() const; + codeview::GUID getGuid() const; uint32_t getNamedStreamMapByteSize() const; PdbRaw_Features getFeatures() const; @@ -71,7 +72,7 @@ private: // Due to the aforementioned limitations with `Signature`, this is a new // signature present on VC70 and higher PDBs which is guaranteed to be // universally unique. 
- PDB_UniqueId Guid; + codeview::GUID Guid; BinarySubstreamRef SubNamedStreams; diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h index 90c28a90d252..c6cb0e221e70 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h @@ -37,7 +37,7 @@ public: void setVersion(PdbRaw_ImplVer V); void setSignature(uint32_t S); void setAge(uint32_t A); - void setGuid(PDB_UniqueId G); + void setGuid(codeview::GUID G); void addFeature(PdbRaw_FeatureSig Sig); uint32_t finalize(); @@ -54,7 +54,7 @@ private: PdbRaw_ImplVer Ver; uint32_t Sig; uint32_t Age; - PDB_UniqueId Guid; + codeview::GUID Guid; NamedStreamMap &NamedStreams; }; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h index ddb7f811da38..587c7ff2b092 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h @@ -27,7 +27,7 @@ public: uint32_t getAge() const override; std::string getSymbolsFileName() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; bool hasCTypes() const override; bool hasPrivateSymbols() const override; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h index 66a9eae28e23..2c6548dcce21 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h @@ -111,7 +111,7 @@ public: getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git 
a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h deleted file mode 100644 index 196ba4d6ffbd..000000000000 --- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- PDBTypeServerHandler.h -----------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H -#define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H - -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/PDBTypes.h" - -#include -#include - -namespace llvm { -namespace pdb { -class NativeSession; - -class PDBTypeServerHandler : public codeview::TypeServerHandler { -public: - PDBTypeServerHandler(bool RevisitAlways = false); - - void addSearchPath(StringRef Path); - Expected handle(codeview::TypeServer2Record &TS, - codeview::TypeVisitorCallbacks &Callbacks) override; - -private: - Expected handleInternal(PDBFile &File, - codeview::TypeVisitorCallbacks &Callbacks); - - bool RevisitAlways; - std::unique_ptr Session; - StringSet<> SearchPaths; -}; -} -} - -#endif diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h index a3cdd3f09a44..b6321cbf45a8 100644 --- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h +++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h @@ -10,6 +10,7 @@ #ifndef LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H #define LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include 
"llvm/Support/Endian.h" @@ -268,17 +269,6 @@ struct PublicsStreamHeader { support::ulittle32_t NumSections; }; -/// Defines a 128-bit unique identifier. This maps to a GUID on Windows, but -/// is abstracted here for the purposes of non-Windows platforms that don't have -/// the GUID structure defined. -struct PDB_UniqueId { - uint8_t Guid[16]; -}; - -inline bool operator==(const PDB_UniqueId &LHS, const PDB_UniqueId &RHS) { - return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); -} - // The header preceeding the global TPI stream. // This corresponds to `HDR` in PDB/dbi/tpi.h. struct TpiStreamHeader { @@ -312,7 +302,7 @@ struct InfoStreamHeader { support::ulittle32_t Version; support::ulittle32_t Signature; support::ulittle32_t Age; - PDB_UniqueId Guid; + codeview::GUID Guid; }; /// The header preceeding the /names stream. diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h index 156abb59a6be..c1edec7a26fe 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h @@ -10,84 +10,13 @@ #ifndef LLVM_DEBUGINFO_PDB_TPIHASHING_H #define LLVM_DEBUGINFO_PDB_TPIHASHING_H -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" -#include -#include namespace llvm { namespace pdb { -class TpiHashUpdater : public codeview::TypeVisitorCallbacks { -public: - TpiHashUpdater() = default; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - virtual Error visitKnownRecord(codeview::CVType &CVR, \ - codeview::Name##Record &Record) override { \ - visitKnownRecordImpl(CVR, Record); \ - return Error::success(); \ - } -#define TYPE_RECORD_ALIAS(EnumName, 
EnumVal, Name, AliasName) -#define MEMBER_RECORD(EnumName, EnumVal, Name) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" - -private: - template - void visitKnownRecordImpl(codeview::CVType &CVR, RecordKind &Record) { - CVR.Hash = 0; - } - - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::ClassRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::EnumRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::UnionRecord &Rec); -}; - -class TpiHashVerifier : public codeview::TypeVisitorCallbacks { -public: - TpiHashVerifier(FixedStreamArray &HashValues, - uint32_t NumHashBuckets) - : HashValues(HashValues), NumHashBuckets(NumHashBuckets) {} - - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::ClassRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::EnumRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UnionRecord &Rec) override; - Error visitTypeBegin(codeview::CVType &CVR) override; - -private: - Error verifySourceLine(codeview::TypeIndex TI); - - Error errorInvalidHash() { - return make_error( - raw_error_code::invalid_tpi_hash, - "Type index is 0x" + - utohexstr(codeview::TypeIndex::FirstNonSimpleIndex + Index)); - } - - FixedStreamArray HashValues; - codeview::CVType RawRecord; - uint32_t NumHashBuckets; - uint32_t Index = -1; -}; +Expected hashTypeRecord(const llvm::codeview::CVType &Type); } // end namespace pdb } // end namespace llvm diff --git a/include/llvm/DebugInfo/PDB/PDBExtras.h 
b/include/llvm/DebugInfo/PDB/PDBExtras.h index 3a38f21b94c8..778121c8eb79 100644 --- a/include/llvm/DebugInfo/PDB/PDBExtras.h +++ b/include/llvm/DebugInfo/PDB/PDBExtras.h @@ -32,7 +32,6 @@ raw_ostream &operator<<(raw_ostream &OS, const PDB_Checksum &Checksum); raw_ostream &operator<<(raw_ostream &OS, const PDB_Lang &Lang); raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag); raw_ostream &operator<<(raw_ostream &OS, const PDB_MemberAccess &Access); -raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Guid); raw_ostream &operator<<(raw_ostream &OS, const PDB_UdtType &Type); raw_ostream &operator<<(raw_ostream &OS, const PDB_Machine &Machine); diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index c1acca386820..27b5457fc8ff 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -22,6 +22,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" @@ -289,21 +290,21 @@ public: // FIXME: We should track and free associated resources (unused compile // callbacks, uncompiled IR, and no-longer-needed/reachable function // implementations). - // FIXME: Return Error once the JIT APIs are Errorized. 
- bool updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { + Error updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { //Find out which logical dylib contains our symbol auto LDI = LogicalDylibs.begin(); for (auto LDE = LogicalDylibs.end(); LDI != LDE; ++LDI) { - if (auto LMResources = LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { + if (auto LMResources = + LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { Module &SrcM = LMResources->SourceModule->getResource(); std::string CalledFnName = mangle(FuncName, SrcM.getDataLayout()); - if (auto EC = LMResources->StubsMgr->updatePointer(CalledFnName, FnBodyAddr)) - return false; - else - return true; + if (auto Err = LMResources->StubsMgr->updatePointer(CalledFnName, + FnBodyAddr)) + return Err; + return Error::success(); } } - return false; + return make_error(FuncName); } private: @@ -363,11 +364,8 @@ private: }); } - auto EC = LD.StubsMgr->createStubs(StubInits); - (void)EC; - // FIXME: This should be propagated back to the user. Stub creation may - // fail for remote JITs. - assert(!EC && "Error generating stubs"); + if (auto Err = LD.StubsMgr->createStubs(StubInits)) + return Err; } // If this module doesn't contain any globals, aliases, or module flags then diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index a9778514b9f1..0c1862c5c3ea 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -135,12 +135,13 @@ public: virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); -private: +protected: struct EHFrame { uint8_t *Addr; size_t Size; }; - std::vector EHFrames; + typedef std::vector EHFrameInfos; + EHFrameInfos EHFrames; }; // Create wrappers for C Binding types (see CBindingWrapping.h). 
diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 801e88aba4d1..850964afc307 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -143,11 +143,15 @@ namespace CallingConv { /// System V ABI, used on most non-Windows systems. X86_64_SysV = 78, - /// \brief The C convention as implemented on Windows/x86-64. This - /// convention differs from the more common \c X86_64_SysV convention - /// in a number of ways, most notably in that XMM registers used to pass - /// arguments are shadowed by GPRs, and vice versa. - X86_64_Win64 = 79, + /// \brief The C convention as implemented on Windows/x86-64 and + /// AArch64. This convention differs from the more common + /// \c X86_64_SysV convention in a number of ways, most notably in + /// that XMM registers used to pass arguments are shadowed by GPRs, + /// and vice versa. + /// On AArch64, this is identical to the normal C (AAPCS) calling + /// convention for normal functions, but floats are passed in integer + /// registers to variadic functions. + Win64 = 79, /// \brief MSVC calling convention that passes vectors and vector aggregates /// in SSE registers. diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index 2e72c41ccee3..0094fd54992a 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -598,6 +598,10 @@ public: /// specified element in the low bits of a uint64_t. uint64_t getElementAsInteger(unsigned i) const; + /// If this is a sequential container of integers (of any size), return the + /// specified element as an APInt. + APInt getElementAsAPInt(unsigned i) const; + /// If this is a sequential container of floating point type, return the /// specified element as an APFloat. APFloat getElementAsAPFloat(unsigned i) const; @@ -761,6 +765,10 @@ public: /// i32/i64/float/double) and must be a ConstantFP or ConstantInt. 
static Constant *getSplat(unsigned NumElts, Constant *Elt); + /// Returns true if this is a splat constant, meaning that all elements have + /// the same value. + bool isSplat() const; + /// If this is a splat constant, meaning that all of the elements have the /// same value, return that value. Otherwise return NULL. Constant *getSplatValue() const; diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 8e6bb4baccaf..6a14f783005d 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -674,32 +674,37 @@ namespace llvm { /// Create a descriptor for an imported module. /// \param Context The scope this module is imported into - /// \param NS The namespace being imported here - /// \param Line Line number + /// \param NS The namespace being imported here. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, DINamespace *NS, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param NS An aliased namespace - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param NS An aliased namespace. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, - DIImportedEntity *NS, unsigned Line); + DIImportedEntity *NS, DIFile *File, + unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param M The module being imported here - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param M The module being imported here + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. 
DIImportedEntity *createImportedModule(DIScope *Context, DIModule *M, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported function. - /// \param Context The scope this module is imported into - /// \param Decl The declaration (or definition) of a function, type, or - /// variable - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param Decl The declaration (or definition) of a function, type, or + /// variable. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedDeclaration(DIScope *Context, DINode *Decl, - unsigned Line, + DIFile *File, unsigned Line, StringRef Name = ""); /// Insert a new llvm.dbg.declare intrinsic call. diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 9374fe4fae76..678a43ae7926 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -435,10 +435,10 @@ public: /// Return the raw underlying file. /// - /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file - /// (it\em is the file). If \c this is an \a DIFile, we need to return \c - /// this. Otherwise, return the first operand, which is where all other - /// subclasses store their file pointer. + /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file (it + /// \em is the file). If \c this is an \a DIFile, we need to return \c this. + /// Otherwise, return the first operand, which is where all other subclasses + /// store their file pointer. Metadata *getRawFile() const { return isa(this) ? 
const_cast(this) : static_cast(getOperand(0)); @@ -2551,32 +2551,32 @@ class DIImportedEntity : public DINode { static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name, + DIFile *File, unsigned Line, StringRef Name, StorageType Storage, bool ShouldCreate = true) { - return getImpl(Context, Tag, Scope, Entity, Line, + return getImpl(Context, Tag, Scope, Entity, File, Line, getCanonicalMDString(Context, Name), Storage, ShouldCreate); } static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate = true); TempDIImportedEntity cloneImpl() const { return getTemporary(getContext(), getTag(), getScope(), getEntity(), - getLine(), getName()); + getFile(), getLine(), getName()); } public: DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name = ""), - (Tag, Scope, Entity, Line, Name)) + DIFile *File, unsigned Line, StringRef Name = ""), + (Tag, Scope, Entity, File, Line, Name)) DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name), - (Tag, Scope, Entity, Line, Name)) + Metadata *File, unsigned Line, MDString *Name), + (Tag, Scope, Entity, File, Line, Name)) TempDIImportedEntity clone() const { return cloneImpl(); } @@ -2584,10 +2584,12 @@ public: DIScope *getScope() const { return cast_or_null(getRawScope()); } DINodeRef getEntity() const { return DINodeRef(getRawEntity()); } StringRef getName() const { return getStringOperand(2); } + DIFile *getFile() const { return cast_or_null(getRawFile()); } Metadata *getRawScope() const { return getOperand(0); } Metadata *getRawEntity() const { return getOperand(1); } MDString *getRawName() const { return getOperandAs(2); } + Metadata 
*getRawFile() const { return getOperand(3); } static bool classof(const Metadata *MD) { return MD->getMetadataID() == DIImportedEntityKind; diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h index e10d14c19793..5b21a2c83e4a 100644 --- a/include/llvm/IR/Dominators.h +++ b/include/llvm/IR/Dominators.h @@ -34,22 +34,31 @@ class Module; class raw_ostream; extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree namespace DomTreeBuilder { -extern template void Calculate( - DominatorTreeBaseByGraphTraits> &DT, Function &F); - -extern template void Calculate>( - DominatorTreeBaseByGraphTraits>> &DT, - Function &F); - -extern template bool Verify( - const DominatorTreeBaseByGraphTraits> &DT); - -extern template bool Verify>( - const DominatorTreeBaseByGraphTraits>> - &DT); +using BBDomTree = DomTreeBase; +using BBPostDomTree = PostDomTreeBase; + +extern template void Calculate(BBDomTree &DT, Function &F); +extern template void Calculate(BBPostDomTree &DT, + Function &F); + +extern template void InsertEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void InsertEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); + +extern template void DeleteEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void DeleteEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); + +extern template bool Verify(const BBDomTree &DT); +extern template bool Verify(const BBPostDomTree &DT); } // namespace DomTreeBuilder using DomTreeNode = DomTreeNodeBase; @@ -122,14 +131,12 @@ template <> struct DenseMapInfo { /// the dominator tree is initially constructed may still exist in the tree, /// even if the tree is properly updated. Calling code should not rely on the /// preceding statements; this is stated only to assist human understanding. 
-class DominatorTree : public DominatorTreeBase { -public: - using Base = DominatorTreeBase; +class DominatorTree : public DominatorTreeBase { + public: + using Base = DominatorTreeBase; - DominatorTree() : DominatorTreeBase(false) {} - explicit DominatorTree(Function &F) : DominatorTreeBase(false) { - recalculate(F); - } + DominatorTree() = default; + explicit DominatorTree(Function &F) { recalculate(F); } /// Handle invalidation explicitly. bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td index 8ac56e03be6a..098245344725 100644 --- a/include/llvm/IR/IntrinsicsHexagon.td +++ b/include/llvm/IR/IntrinsicsHexagon.td @@ -32,16 +32,6 @@ class Hexagon_qi_mem_Intrinsic : Hexagon_Intrinsic; - -// -// DEF_FUNCTION_TYPE_1(void_ftype_SI,BT_VOID,BT_INT) -> -// Hexagon_void_si_Intrinsic -// -class Hexagon_void_si_Intrinsic - : Hexagon_Intrinsic; - // // DEF_FUNCTION_TYPE_1(HI_ftype_SI,BT_I16,BT_INT) -> // Hexagon_hi_si_Intrinsic @@ -4959,11 +4949,25 @@ Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">; // def int_hexagon_S2_deinterleave : Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">; + // // BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1) // def int_hexagon_prefetch : -Hexagon_void_si_Intrinsic<"HEXAGON_prefetch">; +Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleana : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleaninva : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dcinva : +Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dczeroa : +Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>; +def int_hexagon_Y4_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>; +def int_hexagon_Y5_l2fetch : 
+Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>; def llvm_ptr32_ty : LLVMPointerType; def llvm_ptr64_ty : LLVMPointerType; diff --git a/include/llvm/IR/IntrinsicsSystemZ.td b/include/llvm/IR/IntrinsicsSystemZ.td index 9be37d3645b2..98065bc51d99 100644 --- a/include/llvm/IR/IntrinsicsSystemZ.td +++ b/include/llvm/IR/IntrinsicsSystemZ.td @@ -373,6 +373,49 @@ let TargetPrefix = "s390" in { def int_s390_vfidb : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Instructions from the Vector Enhancements Facility 1 + def int_s390_vbperm : SystemZBinaryConv<"vbperm", llvm_v2i64_ty, + llvm_v16i8_ty>; + + def int_s390_vmslg : GCCBuiltin<"__builtin_s390_vmslg">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty, + llvm_i32_ty], [IntrNoMem]>; + + def int_s390_vfmaxdb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmindb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmaxsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfminsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + + def int_s390_vfcesbs : SystemZBinaryConvCC; + def int_s390_vfchsbs : SystemZBinaryConvCC; + def int_s390_vfchesbs : SystemZBinaryConvCC; + + def int_s390_vftcisb : SystemZBinaryConvIntCC; + + def int_s390_vfisb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + + // Instructions from the Vector Packed Decimal Facility + def int_s390_vlrl : GCCBuiltin<"__builtin_s390_vlrl">, + Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_s390_vstrl : GCCBuiltin<"__builtin_s390_vstrl">, + Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], + // In fact write-only but there's no property + // for that. 
+ [IntrArgMemOnly]>; } //===----------------------------------------------------------------------===// diff --git a/include/llvm/MC/LaneBitmask.h b/include/llvm/MC/LaneBitmask.h index 5ca06d1148e2..73b987b074db 100644 --- a/include/llvm/MC/LaneBitmask.h +++ b/include/llvm/MC/LaneBitmask.h @@ -75,6 +75,9 @@ namespace llvm { static LaneBitmask getNone() { return LaneBitmask(0); } static LaneBitmask getAll() { return ~LaneBitmask(0); } + static LaneBitmask getLane(unsigned Lane) { + return LaneBitmask(Type(1) << Lane); + } private: Type Mask = 0; diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h index b493ca0b0ea7..b83086c327f2 100644 --- a/include/llvm/MC/MCFixup.h +++ b/include/llvm/MC/MCFixup.h @@ -69,7 +69,7 @@ class MCFixup { /// an instruction or an assembler directive. const MCExpr *Value; - /// The byte index of start of the relocation inside the encoded instruction. + /// The byte index of start of the relocation inside the MCFragment. uint32_t Offset; /// The target dependent kind of fixup item this is. The kind is used to diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 340d8253b8c9..9150a8b5c80a 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -209,6 +209,15 @@ public: /// well. unsigned getNumOperands() const { return NumOperands; } + using const_opInfo_iterator = const MCOperandInfo *; + + const_opInfo_iterator opInfo_begin() const { return OpInfo; } + const_opInfo_iterator opInfo_end() const { return OpInfo + NumOperands; } + + iterator_range operands() const { + return make_range(opInfo_begin(), opInfo_end()); + } + /// \brief Return the number of MachineOperands that are register /// definitions. Register definitions always occur at the start of the /// machine operand list. 
This is the number of "outs" in the .td file, diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h index 060f965233e1..8e215b565fc4 100644 --- a/include/llvm/Object/COFFImportFile.h +++ b/include/llvm/Object/COFFImportFile.h @@ -95,7 +95,7 @@ struct COFFShortExport { } }; -std::error_code writeImportLibrary(StringRef DLLName, +std::error_code writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, COFF::MachineTypes Machine); diff --git a/include/llvm/Object/COFFModuleDefinition.h b/include/llvm/Object/COFFModuleDefinition.h index a0e8eacdb7a3..be139a2833b0 100644 --- a/include/llvm/Object/COFFModuleDefinition.h +++ b/include/llvm/Object/COFFModuleDefinition.h @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// - #ifndef LLVM_OBJECT_COFF_MODULE_DEFINITION_H #define LLVM_OBJECT_COFF_MODULE_DEFINITION_H @@ -29,6 +28,7 @@ namespace object { struct COFFModuleDefinition { std::vector Exports; std::string OutputFile; + std::string ImportName; uint64_t ImageBase = 0; uint64_t StackReserve = 0; uint64_t StackCommit = 0; @@ -40,8 +40,12 @@ struct COFFModuleDefinition { uint32_t MinorOSVersion = 0; }; +// mingw and wine def files do not mangle _ for x86 which +// is a consequence of legacy binutils' dlltool functionality. +// This MingwDef flag should be removed once mingw stops this pratice. Expected -parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine); +parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine, + bool MingwDef = false); } // End namespace object. } // End namespace llvm. 
diff --git a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h index 6746fd60b6cb..88a5668f0a14 100644 --- a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h +++ b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -60,6 +60,8 @@ ArrayRef toDebugT(ArrayRef, BumpPtrAllocator &Alloc); } // end namespace llvm +LLVM_YAML_DECLARE_SCALAR_TRAITS(codeview::GUID, true) + LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::LeafRecord) LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::MemberRecord) diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def index 8eccebcd932a..09f9602a24d9 100644 --- a/include/llvm/Support/AArch64TargetParser.def +++ b/include/llvm/Support/AArch64TargetParser.def @@ -43,8 +43,9 @@ AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto") AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8") AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon") AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16") -AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") -AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") +AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") #undef AARCH64_ARCH_EXT_NAME #ifndef AARCH64_CPU_NAME diff --git a/include/llvm/Support/BinaryItemStream.h b/include/llvm/Support/BinaryItemStream.h index f4b319217819..fe7e6caeaafb 100644 --- a/include/llvm/Support/BinaryItemStream.h +++ b/include/llvm/Support/BinaryItemStream.h @@ -62,32 +62,45 @@ public: return Error::success(); } - void setItems(ArrayRef ItemArray) { Items = ItemArray; } + void setItems(ArrayRef ItemArray) { + Items = ItemArray; + computeItemOffsets(); + } uint32_t getLength() override { - uint32_t Size = 0; - for (const auto &Item : Items) - Size += 
Traits::length(Item); - return Size; + return ItemEndOffsets.empty() ? 0 : ItemEndOffsets.back(); } private: - Expected translateOffsetIndex(uint32_t Offset) const { + void computeItemOffsets() { + ItemEndOffsets.clear(); + ItemEndOffsets.reserve(Items.size()); uint32_t CurrentOffset = 0; - uint32_t CurrentIndex = 0; for (const auto &Item : Items) { - if (CurrentOffset >= Offset) - break; - CurrentOffset += Traits::length(Item); - ++CurrentIndex; + uint32_t Len = Traits::length(Item); + assert(Len > 0 && "no empty items"); + CurrentOffset += Len; + ItemEndOffsets.push_back(CurrentOffset); } - if (CurrentOffset != Offset) + } + + Expected translateOffsetIndex(uint32_t Offset) { + // Make sure the offset is somewhere in our items array. + if (Offset >= getLength()) return make_error(stream_error_code::stream_too_short); - return CurrentIndex; + ++Offset; + auto Iter = + std::lower_bound(ItemEndOffsets.begin(), ItemEndOffsets.end(), Offset); + size_t Idx = std::distance(ItemEndOffsets.begin(), Iter); + assert(Idx < Items.size() && "binary search for offset failed"); + return Idx; } llvm::support::endianness Endian; ArrayRef Items; + + // Sorted vector of offsets to accelerate lookup. + std::vector ItemEndOffsets; }; } // end namespace llvm diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h index 017b4973f1ff..bcbd2bec5722 100644 --- a/include/llvm/Support/Format.h +++ b/include/llvm/Support/Format.h @@ -125,30 +125,39 @@ inline format_object format(const char *Fmt, const Ts &... Vals) { return format_object(Fmt, Vals...); } -/// This is a helper class used for left_justify() and right_justify(). +/// This is a helper class for left_justify, right_justify, and center_justify. 
class FormattedString { +public: + enum Justification { JustifyNone, JustifyLeft, JustifyRight, JustifyCenter }; + FormattedString(StringRef S, unsigned W, Justification J) + : Str(S), Width(W), Justify(J) {} + +private: StringRef Str; unsigned Width; - bool RightJustify; + Justification Justify; friend class raw_ostream; - -public: - FormattedString(StringRef S, unsigned W, bool R) - : Str(S), Width(W), RightJustify(R) { } }; /// left_justify - append spaces after string so total output is /// \p Width characters. If \p Str is larger that \p Width, full string /// is written with no padding. inline FormattedString left_justify(StringRef Str, unsigned Width) { - return FormattedString(Str, Width, false); + return FormattedString(Str, Width, FormattedString::JustifyLeft); } /// right_justify - add spaces before string so total output is /// \p Width characters. If \p Str is larger that \p Width, full string /// is written with no padding. inline FormattedString right_justify(StringRef Str, unsigned Width) { - return FormattedString(Str, Width, true); + return FormattedString(Str, Width, FormattedString::JustifyRight); +} + +/// center_justify - add spaces before and after string so total output is +/// \p Width characters. If \p Str is larger that \p Width, full string +/// is written with no padding. +inline FormattedString center_justify(StringRef Str, unsigned Width) { + return FormattedString(Str, Width, FormattedString::JustifyCenter); } /// This is a helper class used for format_hex() and format_decimal(). 
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index 394a45387d8a..706320fed9a7 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -41,27 +41,21 @@ namespace llvm { -template class DominatorTreeBase; +template +class DominatorTreeBase; -namespace detail { - -template struct DominatorTreeBaseTraits { - static_assert(std::is_pointer::value, - "Currently NodeRef must be a pointer type."); - using type = DominatorTreeBase< - typename std::remove_pointer::type>; -}; - -} // end namespace detail - -template -using DominatorTreeBaseByGraphTraits = - typename detail::DominatorTreeBaseTraits::type; +namespace DomTreeBuilder { +template +struct SemiNCAInfo; +} // namespace DomTreeBuilder /// \brief Base class for the actual dominator tree node. template class DomTreeNodeBase { friend struct PostDominatorTree; - template friend class DominatorTreeBase; + friend class DominatorTreeBase; + friend class DominatorTreeBase; + friend struct DomTreeBuilder::SemiNCAInfo>; + friend struct DomTreeBuilder::SemiNCAInfo>; NodeT *TheBB; DomTreeNodeBase *IDom; @@ -192,58 +186,69 @@ void PrintDomTree(const DomTreeNodeBase *N, raw_ostream &O, } namespace DomTreeBuilder { -template -struct SemiNCAInfo; +// The routines below are provided in a separate header but referenced here. +template +void Calculate(DomTreeT &DT, FuncT &F); + +template +void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To); -// The calculate routine is provided in a separate header but referenced here. -template -void Calculate(DominatorTreeBaseByGraphTraits> &DT, FuncT &F); +template +void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To); -// The verify function is provided in a separate header but referenced here. 
-template -bool Verify(const DominatorTreeBaseByGraphTraits> &DT); +template +bool Verify(const DomTreeT &DT); } // namespace DomTreeBuilder /// \brief Core dominator tree base class. /// /// This class is a generic template over graph nodes. It is instantiated for /// various graphs in the LLVM IR or in the code generator. -template class DominatorTreeBase { +template +class DominatorTreeBase { protected: std::vector Roots; - bool IsPostDominators; using DomTreeNodeMapType = DenseMap>>; DomTreeNodeMapType DomTreeNodes; DomTreeNodeBase *RootNode; + using ParentPtr = decltype(std::declval()->getParent()); + ParentPtr Parent = nullptr; mutable bool DFSInfoValid = false; mutable unsigned int SlowQueries = 0; - friend struct DomTreeBuilder::SemiNCAInfo; - using SNCAInfoTy = DomTreeBuilder::SemiNCAInfo; + friend struct DomTreeBuilder::SemiNCAInfo; public: - explicit DominatorTreeBase(bool isPostDom) : IsPostDominators(isPostDom) {} + static_assert(std::is_pointer::NodeRef>::value, + "Currently DominatorTreeBase supports only pointer nodes"); + using NodeType = NodeT; + using NodePtr = NodeT *; + static constexpr bool IsPostDominator = IsPostDom; + + DominatorTreeBase() {} DominatorTreeBase(DominatorTreeBase &&Arg) : Roots(std::move(Arg.Roots)), - IsPostDominators(Arg.IsPostDominators), DomTreeNodes(std::move(Arg.DomTreeNodes)), - RootNode(std::move(Arg.RootNode)), - DFSInfoValid(std::move(Arg.DFSInfoValid)), - SlowQueries(std::move(Arg.SlowQueries)) { + RootNode(Arg.RootNode), + Parent(Arg.Parent), + DFSInfoValid(Arg.DFSInfoValid), + SlowQueries(Arg.SlowQueries) { Arg.wipe(); } DominatorTreeBase &operator=(DominatorTreeBase &&RHS) { Roots = std::move(RHS.Roots); - IsPostDominators = RHS.IsPostDominators; DomTreeNodes = std::move(RHS.DomTreeNodes); - RootNode = std::move(RHS.RootNode); - DFSInfoValid = std::move(RHS.DFSInfoValid); - SlowQueries = std::move(RHS.SlowQueries); + RootNode = RHS.RootNode; + Parent = RHS.Parent; + DFSInfoValid = RHS.DFSInfoValid; + SlowQueries 
= RHS.SlowQueries; RHS.wipe(); return *this; } @@ -259,11 +264,12 @@ template class DominatorTreeBase { /// isPostDominator - Returns true if analysis based of postdoms /// - bool isPostDominator() const { return IsPostDominators; } + bool isPostDominator() const { return IsPostDominator; } /// compare - Return false if the other dominator tree base matches this /// dominator tree base. Otherwise return true. bool compare(const DominatorTreeBase &Other) const { + if (Parent != Other.Parent) return true; const DomTreeNodeMapType &OtherDomTreeNodes = Other.DomTreeNodes; if (DomTreeNodes.size() != OtherDomTreeNodes.size()) @@ -443,10 +449,50 @@ template class DominatorTreeBase { const_cast(B)); } + bool isVirtualRoot(const DomTreeNodeBase *A) const { + return isPostDominator() && !A->getBlock(); + } + //===--------------------------------------------------------------------===// // API to update (Post)DominatorTree information based on modifications to // the CFG... + /// Inform the dominator tree about a CFG edge insertion and update the tree. + /// + /// This function has to be called just before or just after making the update + /// on the actual CFG. There cannot be any other updates that the dominator + /// tree doesn't know about. + /// + /// Note that for postdominators it automatically takes care of inserting + /// a reverse edge internally (so there's no need to swap the parameters). + /// + void insertEdge(NodeT *From, NodeT *To) { + assert(From); + assert(To); + assert(From->getParent() == Parent); + assert(To->getParent() == Parent); + DomTreeBuilder::InsertEdge(*this, From, To); + } + + /// Inform the dominator tree about a CFG edge deletion and update the tree. + /// + /// This function has to be called just after making the update + /// on the actual CFG. There cannot be any other updates that the dominator + /// tree doesn't know about. 
The only exception is when the deletion that the + /// tree is informed about makes some (domominator) subtree unreachable -- in + /// this case, it is fine to perform deletions within this subtree. + /// + /// Note that for postdominators it automatically takes care of deleting + /// a reverse edge internally (so there's no need to swap the parameters). + /// + void deleteEdge(NodeT *From, NodeT *To) { + assert(From); + assert(To); + assert(From->getParent() == Parent); + assert(To->getParent() == Parent); + DomTreeBuilder::DeleteEdge(*this, From, To); + } + /// Add a new node to the dominator tree information. /// /// This creates a new node as a child of DomBB dominator node, linking it @@ -530,7 +576,7 @@ template class DominatorTreeBase { /// splitBlock - BB is split and now it has one successor. Update dominator /// tree to reflect this change. void splitBlock(NodeT *NewBB) { - if (this->IsPostDominators) + if (IsPostDominator) Split>(NewBB); else Split(NewBB); @@ -607,37 +653,33 @@ public: template void recalculate(FT &F) { using TraitsTy = GraphTraits; reset(); + Parent = &F; - if (!this->IsPostDominators) { + if (!IsPostDominator) { // Initialize root NodeT *entry = TraitsTy::getEntryNode(&F); addRoot(entry); - - DomTreeBuilder::Calculate(*this, F); } else { // Initialize the roots list for (auto *Node : nodes(&F)) if (TraitsTy::child_begin(Node) == TraitsTy::child_end(Node)) addRoot(Node); - - DomTreeBuilder::Calculate>(*this, F); } + + DomTreeBuilder::Calculate(*this, F); } /// verify - check parent and sibling property - bool verify() const { - return this->isPostDominator() - ? 
DomTreeBuilder::Verify>(*this) - : DomTreeBuilder::Verify(*this); - } + bool verify() const { return DomTreeBuilder::Verify(*this); } protected: void addRoot(NodeT *BB) { this->Roots.push_back(BB); } void reset() { DomTreeNodes.clear(); - this->Roots.clear(); + Roots.clear(); RootNode = nullptr; + Parent = nullptr; DFSInfoValid = false; SlowQueries = 0; } @@ -719,13 +761,21 @@ public: void wipe() { DomTreeNodes.clear(); RootNode = nullptr; + Parent = nullptr; } }; +template +using DomTreeBase = DominatorTreeBase; + +template +using PostDomTreeBase = DominatorTreeBase; + // These two functions are declared out of line as a workaround for building // with old (< r147295) versions of clang because of pr11642. -template -bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const { +template +bool DominatorTreeBase::dominates(const NodeT *A, + const NodeT *B) const { if (A == B) return true; @@ -735,9 +785,9 @@ bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const { return dominates(getNode(const_cast(A)), getNode(const_cast(B))); } -template -bool DominatorTreeBase::properlyDominates(const NodeT *A, - const NodeT *B) const { +template +bool DominatorTreeBase::properlyDominates( + const NodeT *A, const NodeT *B) const { if (A == B) return false; diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index a0fec668e05c..be90afa4c3c8 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -20,15 +20,28 @@ /// out that the theoretically slower O(n*log(n)) implementation is actually /// faster than the almost-linear O(n*alpha(n)) version, even for large CFGs. /// +/// The file uses the Depth Based Search algorithm to perform incremental +/// upates (insertion and deletions). 
The implemented algorithm is based on this +/// publication: +/// +/// An Experimental Study of Dynamic Dominators +/// Loukas Georgiadis, et al., April 12 2016, pp. 5-7, 9-10: +/// https://arxiv.org/pdf/1604.02711.pdf +/// //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H #define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H +#include +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/GenericDomTree.h" +#define DEBUG_TYPE "dom-tree-builder" + namespace llvm { namespace DomTreeBuilder { @@ -46,13 +59,14 @@ struct ChildrenGetter { } }; -// Information record used by Semi-NCA during tree construction. -template +template struct SemiNCAInfo { - using NodePtr = NodeT *; - using DomTreeT = DominatorTreeBase; + using NodePtr = typename DomTreeT::NodePtr; + using NodeT = typename DomTreeT::NodeType; using TreeNodePtr = DomTreeNodeBase *; + static constexpr bool IsPostDom = DomTreeT::IsPostDominator; + // Information record used by Semi-NCA during tree construction. struct InfoRec { unsigned DFSNum = 0; unsigned Parent = 0; @@ -62,11 +76,13 @@ struct SemiNCAInfo { SmallVector ReverseChildren; }; - std::vector NumToNode; + // Number to node mapping is 1-based. Initialize the mapping to start with + // a dummy element. + std::vector NumToNode = {nullptr}; DenseMap NodeToInfo; void clear() { - NumToNode.clear(); + NumToNode = {nullptr}; // Restore to initial state with a dummy start node. 
NodeToInfo.clear(); } @@ -90,12 +106,28 @@ struct SemiNCAInfo { // Add a new tree node for this NodeT, and link it as a child of // IDomNode return (DT.DomTreeNodes[BB] = IDomNode->addChild( - llvm::make_unique>(BB, IDomNode))) + llvm::make_unique>(BB, IDomNode))) .get(); } static bool AlwaysDescend(NodePtr, NodePtr) { return true; } + struct BlockNamePrinter { + NodePtr N; + + BlockNamePrinter(NodePtr Block) : N(Block) {} + BlockNamePrinter(TreeNodePtr TN) : N(TN ? TN->getBlock() : nullptr) {} + + friend raw_ostream &operator<<(raw_ostream &O, const BlockNamePrinter &BP) { + if (!BP.N) + O << "nullptr"; + else + BP.N->printAsOperand(O, false); + + return O; + } + }; + // Custom DFS implementation which can skip nodes based on a provided // predicate. It also collects ReverseChildren so that we don't have to spend // time getting predecessors in SemiNCA. @@ -177,44 +209,42 @@ struct SemiNCAInfo { return VInInfo.Label; } - template - void runSemiNCA(DomTreeT &DT, unsigned NumBlocks) { - // Step #1: Number blocks in depth-first order and initialize variables used - // in later stages of the algorithm. - const unsigned N = doFullDFSWalk(DT, AlwaysDescend); - - // It might be that some blocks did not get a DFS number (e.g., blocks of - // infinite loops). In these cases an artificial exit node is required. - const bool MultipleRoots = - DT.Roots.size() > 1 || (DT.isPostDominator() && N != NumBlocks); - + // This function requires DFS to be run before calling it. + void runSemiNCA(DomTreeT &DT, const unsigned MinLevel = 0) { + const unsigned NextDFSNum(NumToNode.size()); // Initialize IDoms to spanning tree parents. - for (unsigned i = 1; i <= N; ++i) { + for (unsigned i = 1; i < NextDFSNum; ++i) { const NodePtr V = NumToNode[i]; auto &VInfo = NodeToInfo[V]; VInfo.IDom = NumToNode[VInfo.Parent]; } - // Step #2: Calculate the semidominators of all vertices. - for (unsigned i = N; i >= 2; --i) { + // Step #1: Calculate the semidominators of all vertices. 
+ for (unsigned i = NextDFSNum - 1; i >= 2; --i) { NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; // Initialize the semi dominator to point to the parent node. WInfo.Semi = WInfo.Parent; - for (const auto &N : WInfo.ReverseChildren) - if (NodeToInfo.count(N)) { // Only if this predecessor is reachable! - unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; - if (SemiU < WInfo.Semi) - WInfo.Semi = SemiU; - } + for (const auto &N : WInfo.ReverseChildren) { + if (NodeToInfo.count(N) == 0) // Skip unreachable predecessors. + continue; + + const TreeNodePtr TN = DT.getNode(N); + // Skip predecessors whose level is above the subtree we are processing. + if (TN && TN->getLevel() < MinLevel) + continue; + + unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; + if (SemiU < WInfo.Semi) WInfo.Semi = SemiU; + } } - // Step #3: Explicitly define the immediate dominator of each vertex. + // Step #2: Explicitly define the immediate dominator of each vertex. // IDom[i] = NCA(SDom[i], SpanningTreeParent(i)). // Note that the parents were stored in IDoms and later got invalidated // during path compression in Eval. 
- for (unsigned i = 2; i <= N; ++i) { + for (unsigned i = 2; i < NextDFSNum; ++i) { const NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; const unsigned SDomNum = NodeToInfo[NumToNode[WInfo.Semi]].DFSNum; @@ -224,6 +254,36 @@ struct SemiNCAInfo { WInfo.IDom = WIDomCandidate; } + } + + template + unsigned doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) { + unsigned Num = 0; + + if (DT.Roots.size() > 1) { + auto &BBInfo = NodeToInfo[nullptr]; + BBInfo.DFSNum = BBInfo.Semi = ++Num; + BBInfo.Label = nullptr; + + NumToNode.push_back(nullptr); // NumToNode[n] = V; + } + + if (DT.isPostDominator()) { + for (auto *Root : DT.Roots) Num = runDFS(Root, Num, DC, 1); + } else { + assert(DT.Roots.size() == 1); + Num = runDFS(DT.Roots[0], Num, DC, Num); + } + + return Num; + } + + void calculateFromScratch(DomTreeT &DT, const unsigned NumBlocks) { + // Step #0: Number blocks in depth-first order and initialize variables used + // in later stages of the algorithm. + const unsigned LastDFSNum = doFullDFSWalk(DT, AlwaysDescend); + + runSemiNCA(DT); if (DT.Roots.empty()) return; @@ -231,25 +291,32 @@ struct SemiNCAInfo { // one exit block, or it may be the virtual exit (denoted by // (BasicBlock *)0) which postdominates all real exits if there are multiple // exit blocks, or an infinite loop. + // It might be that some blocks did not get a DFS number (e.g., blocks of + // infinite loops). In these cases an artificial exit node is required. + const bool MultipleRoots = DT.Roots.size() > 1 || (DT.isPostDominator() && + LastDFSNum != NumBlocks); NodePtr Root = !MultipleRoots ? DT.Roots[0] : nullptr; - DT.RootNode = - (DT.DomTreeNodes[Root] = - llvm::make_unique>(Root, nullptr)) - .get(); + DT.RootNode = (DT.DomTreeNodes[Root] = + llvm::make_unique>(Root, nullptr)) + .get(); + attachNewSubtree(DT, DT.RootNode); + } - // Loop over all of the reachable blocks in the function... 
- for (unsigned i = 2; i <= N; ++i) { + void attachNewSubtree(DomTreeT& DT, const TreeNodePtr AttachTo) { + // Attach the first unreachable block to AttachTo. + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + // Loop over all of the discovered blocks in the function... + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { NodePtr W = NumToNode[i]; + DEBUG(dbgs() << "\tdiscovered a new reachable node " + << BlockNamePrinter(W) << "\n"); // Don't replace this with 'count', the insertion side effect is important - if (DT.DomTreeNodes[W]) - continue; // Haven't calculated this node yet? + if (DT.DomTreeNodes[W]) continue; // Haven't calculated this node yet? NodePtr ImmDom = getIDom(W); - assert(ImmDom || DT.DomTreeNodes[nullptr]); - // Get or calculate the node for the immediate dominator TreeNodePtr IDomNode = getNodeForBlock(ImmDom, DT); @@ -260,34 +327,208 @@ struct SemiNCAInfo { } } - template - unsigned doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) { - unsigned Num = 0; - NumToNode.push_back(nullptr); + void reattachExistingSubtree(DomTreeT &DT, const TreeNodePtr AttachTo) { + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { + const NodePtr N = NumToNode[i]; + const TreeNodePtr TN = DT.getNode(N); + assert(TN); + const TreeNodePtr NewIDom = DT.getNode(NodeToInfo[N].IDom); + TN->setIDom(NewIDom); + } + } - if (DT.Roots.size() > 1) { - auto &BBInfo = NodeToInfo[nullptr]; - BBInfo.DFSNum = BBInfo.Semi = ++Num; - BBInfo.Label = nullptr; + // Helper struct used during edge insertions. + struct InsertionInfo { + using BucketElementTy = std::pair; + struct DecreasingLevel { + bool operator()(const BucketElementTy &First, + const BucketElementTy &Second) const { + return First.first > Second.first; + } + }; + + std::priority_queue, + DecreasingLevel> + Bucket; // Queue of tree nodes sorted by level in descending order. 
+ SmallDenseSet Affected; + SmallDenseSet Visited; + SmallVector AffectedQueue; + SmallVector VisitedNotAffectedQueue; + }; - NumToNode.push_back(nullptr); // NumToNode[n] = V; + static void InsertEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) { + assert(From && To && "Cannot connect nullptrs"); + DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> " + << BlockNamePrinter(To) << "\n"); + const TreeNodePtr FromTN = DT.getNode(From); + + // Ignore edges from unreachable nodes. + if (!FromTN) return; + + DT.DFSInfoValid = false; + + const TreeNodePtr ToTN = DT.getNode(To); + if (!ToTN) + InsertUnreachable(DT, FromTN, To); + else + InsertReachable(DT, FromTN, ToTN); + } + + // Handles insertion to a node already in the dominator tree. + static void InsertReachable(DomTreeT &DT, const TreeNodePtr From, + const TreeNodePtr To) { + DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock()) + << " -> " << BlockNamePrinter(To->getBlock()) << "\n"); + const NodePtr NCDBlock = + DT.findNearestCommonDominator(From->getBlock(), To->getBlock()); + assert(NCDBlock || DT.isPostDominator()); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + assert(NCD); + + DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n"); + const TreeNodePtr ToIDom = To->getIDom(); + + // Nothing affected -- NCA property holds. + // (Based on the lemma 2.5 from the second paper.) + if (NCD == To || NCD == ToIDom) return; + + // Identify and collect affected nodes. 
+ InsertionInfo II; + DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n"); + II.Affected.insert(To); + const unsigned ToLevel = To->getLevel(); + DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n"); + II.Bucket.push({ToLevel, To}); + + while (!II.Bucket.empty()) { + const TreeNodePtr CurrentNode = II.Bucket.top().second; + II.Bucket.pop(); + DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: " + << BlockNamePrinter(CurrentNode) << "\n"); + II.Visited.insert(CurrentNode); + II.AffectedQueue.push_back(CurrentNode); + + // Discover and collect affected successors of the current node. + VisitInsertion(DT, CurrentNode, CurrentNode->getLevel(), NCD, II); } - if (DT.isPostDominator()) { - for (auto *Root : DT.Roots) Num = runDFS(Root, Num, DC, 1); - } else { - assert(DT.Roots.size() == 1); - Num = runDFS(DT.Roots[0], Num, DC, Num); + // Finish by updating immediate dominators and levels. + UpdateInsertion(DT, NCD, II); + } + + // Visits an affected node and collect its affected successors. + static void VisitInsertion(DomTreeT &DT, const TreeNodePtr TN, + const unsigned RootLevel, const TreeNodePtr NCD, + InsertionInfo &II) { + const unsigned NCDLevel = NCD->getLevel(); + DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n"); + + assert(TN->getBlock()); + for (const NodePtr Succ : + ChildrenGetter::Get(TN->getBlock())) { + const TreeNodePtr SuccTN = DT.getNode(Succ); + assert(SuccTN && "Unreachable successor found at reachable insertion"); + const unsigned SuccLevel = SuccTN->getLevel(); + + DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) + << ", level = " << SuccLevel << "\n"); + + // Succ dominated by subtree From -- not affected. + // (Based on the lemma 2.5 from the second paper.) 
+ if (SuccLevel > RootLevel) { + DEBUG(dbgs() << "\t\tDominated by subtree From\n"); + if (II.Visited.count(SuccTN) != 0) continue; + + DEBUG(dbgs() << "\t\tMarking visited not affected " + << BlockNamePrinter(Succ) << "\n"); + II.Visited.insert(SuccTN); + II.VisitedNotAffectedQueue.push_back(SuccTN); + VisitInsertion(DT, SuccTN, RootLevel, NCD, II); + } else if ((SuccLevel > NCDLevel + 1) && II.Affected.count(SuccTN) == 0) { + DEBUG(dbgs() << "\t\tMarking affected and adding " + << BlockNamePrinter(Succ) << " to a Bucket\n"); + II.Affected.insert(SuccTN); + II.Bucket.push({SuccLevel, SuccTN}); + } } + } - return Num; + // Updates immediate dominators and levels after insertion. + static void UpdateInsertion(DomTreeT &DT, const TreeNodePtr NCD, + InsertionInfo &II) { + DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n"); + + for (const TreeNodePtr TN : II.AffectedQueue) { + DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(TN) + << ") = " << BlockNamePrinter(NCD) << "\n"); + TN->setIDom(NCD); + } + + UpdateLevelsAfterInsertion(II); } - static void PrintBlockOrNullptr(raw_ostream &O, NodePtr Obj) { - if (!Obj) - O << "nullptr"; - else - Obj->printAsOperand(O, false); + static void UpdateLevelsAfterInsertion(InsertionInfo &II) { + DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n"); + + for (const TreeNodePtr TN : II.VisitedNotAffectedQueue) { + DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(TN) << ") = (" + << BlockNamePrinter(TN->getIDom()) << ") " + << TN->getIDom()->getLevel() << " + 1\n"); + TN->UpdateLevel(); + } + } + + // Handles insertion to previously unreachable nodes. + static void InsertUnreachable(DomTreeT &DT, const TreeNodePtr From, + const NodePtr To) { + DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From) + << " -> (unreachable) " << BlockNamePrinter(To) << "\n"); + + // Collect discovered edges to already reachable nodes. 
+ SmallVector, 8> DiscoveredEdgesToReachable; + // Discover and connect nodes that became reachable with the insertion. + ComputeUnreachableDominators(DT, To, From, DiscoveredEdgesToReachable); + + DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From) + << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n"); + + DEBUG(DT.print(dbgs())); + + // Used the discovered edges and inset discovered connecting (incoming) + // edges. + for (const auto &Edge : DiscoveredEdgesToReachable) { + DEBUG(dbgs() << "\tInserting discovered connecting edge " + << BlockNamePrinter(Edge.first) << " -> " + << BlockNamePrinter(Edge.second) << "\n"); + InsertReachable(DT, DT.getNode(Edge.first), Edge.second); + } + } + + // Connects nodes that become reachable with an insertion. + static void ComputeUnreachableDominators( + DomTreeT &DT, const NodePtr Root, const TreeNodePtr Incoming, + SmallVectorImpl> + &DiscoveredConnectingEdges) { + assert(!DT.getNode(Root) && "Root must not be reachable"); + + // Visit only previously unreachable nodes. + auto UnreachableDescender = [&DT, &DiscoveredConnectingEdges](NodePtr From, + NodePtr To) { + const TreeNodePtr ToTN = DT.getNode(To); + if (!ToTN) return true; + + DiscoveredConnectingEdges.push_back({From, ToTN}); + return false; + }; + + SemiNCAInfo SNCA; + SNCA.runDFS(Root, 0, UnreachableDescender, 0); + SNCA.runSemiNCA(DT); + SNCA.attachNewSubtree(DT, Incoming); + + DEBUG(dbgs() << "After adding unreachable nodes\n"); + DEBUG(DT.print(dbgs())); } // Checks if the tree contains all reachable nodes in the input graph. @@ -298,12 +539,23 @@ struct SemiNCAInfo { for (auto &NodeToTN : DT.DomTreeNodes) { const TreeNodePtr TN = NodeToTN.second.get(); const NodePtr BB = TN->getBlock(); - if (!BB) continue; + + // Virtual root has a corresponding virtual CFG node. 
+ if (DT.isVirtualRoot(TN)) continue; if (NodeToInfo.count(BB) == 0) { - errs() << "DomTree node "; - PrintBlockOrNullptr(errs(), BB); - errs() << " not found by DFS walk!\n"; + errs() << "DomTree node " << BlockNamePrinter(BB) + << " not found by DFS walk!\n"; + errs().flush(); + + return false; + } + } + + for (const NodePtr N : NumToNode) { + if (N && !DT.getNode(N)) { + errs() << "CFG node " << BlockNamePrinter(N) + << " not found in the DomTree!\n"; errs().flush(); return false; @@ -313,6 +565,215 @@ struct SemiNCAInfo { return true; } + static void DeleteEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) { + assert(From && To && "Cannot disconnect nullptrs"); + DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> " + << BlockNamePrinter(To) << "\n"); + +#ifndef NDEBUG + // Ensure that the edge was in fact deleted from the CFG before informing + // the DomTree about it. + // The check is O(N), so run it only in debug configuration. + auto IsSuccessor = [](const NodePtr SuccCandidate, const NodePtr Of) { + auto Successors = ChildrenGetter::Get(Of); + return llvm::find(Successors, SuccCandidate) != Successors.end(); + }; + (void)IsSuccessor; + assert(!IsSuccessor(To, From) && "Deleted edge still exists in the CFG!"); +#endif + + const TreeNodePtr FromTN = DT.getNode(From); + // Deletion in an unreachable subtree -- nothing to do. + if (!FromTN) return; + + const TreeNodePtr ToTN = DT.getNode(To); + assert(ToTN && "To already unreachable -- there is no edge to delete"); + const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + + // To dominates From -- nothing to do. + if (ToTN == NCD) return; + + const TreeNodePtr ToIDom = ToTN->getIDom(); + DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom " + << BlockNamePrinter(ToIDom) << "\n"); + + // To remains reachable after deletion. + // (Based on the caption under Figure 4. from the second paper.) 
+ if (FromTN != ToIDom || HasProperSupport(DT, ToTN)) + DeleteReachable(DT, FromTN, ToTN); + else + DeleteUnreachable(DT, ToTN); + } + + // Handles deletions that leave destination nodes reachable. + static void DeleteReachable(DomTreeT &DT, const TreeNodePtr FromTN, + const TreeNodePtr ToTN) { + DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> " + << BlockNamePrinter(ToTN) << "\n"); + DEBUG(dbgs() << "\tRebuilding subtree\n"); + + // Find the top of the subtree that needs to be rebuilt. + // (Based on the lemma 2.6 from the second paper.) + const NodePtr ToIDom = + DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock()); + assert(ToIDom || DT.isPostDominator()); + const TreeNodePtr ToIDomTN = DT.getNode(ToIDom); + assert(ToIDomTN); + const TreeNodePtr PrevIDomSubTree = ToIDomTN->getIDom(); + // Top of the subtree to rebuild is the root node. Rebuild the tree from + // scratch. + if (!PrevIDomSubTree) { + DEBUG(dbgs() << "The entire tree needs to be rebuilt\n"); + DT.recalculate(*DT.Parent); + return; + } + + // Only visit nodes in the subtree starting at To. + const unsigned Level = ToIDomTN->getLevel(); + auto DescendBelow = [Level, &DT](NodePtr, NodePtr To) { + return DT.getNode(To)->getLevel() > Level; + }; + + DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n"); + + SemiNCAInfo SNCA; + SNCA.runDFS(ToIDom, 0, DescendBelow, 0); + DEBUG(dbgs() << "\tRunning Semi-NCA\n"); + SNCA.runSemiNCA(DT, Level); + SNCA.reattachExistingSubtree(DT, PrevIDomSubTree); + } + + // Checks if a node has proper support, as defined on the page 3 and later + // explained on the page 7 of the second paper. 
+ static bool HasProperSupport(DomTreeT &DT, const TreeNodePtr TN) { + DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n"); + for (const NodePtr Pred : + ChildrenGetter::Get(TN->getBlock())) { + DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n"); + if (!DT.getNode(Pred)) continue; + + const NodePtr Support = + DT.findNearestCommonDominator(TN->getBlock(), Pred); + DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n"); + if (Support != TN->getBlock()) { + DEBUG(dbgs() << "\t" << BlockNamePrinter(TN) + << " is reachable from support " + << BlockNamePrinter(Support) << "\n"); + return true; + } + } + + return false; + } + + // Handle deletions that make destination node unreachable. + // (Based on the lemma 2.7 from the second paper.) + static void DeleteUnreachable(DomTreeT &DT, const TreeNodePtr ToTN) { + DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN) + << "\n"); + assert(ToTN); + assert(ToTN->getBlock()); + + SmallVector AffectedQueue; + const unsigned Level = ToTN->getLevel(); + + // Traverse destination node's descendants with greater level in the tree + // and collect visited nodes. + auto DescendAndCollect = [Level, &AffectedQueue, &DT](NodePtr, NodePtr To) { + const TreeNodePtr TN = DT.getNode(To); + assert(TN); + if (TN->getLevel() > Level) return true; + if (llvm::find(AffectedQueue, To) == AffectedQueue.end()) + AffectedQueue.push_back(To); + + return false; + }; + + SemiNCAInfo SNCA; + unsigned LastDFSNum = + SNCA.runDFS(ToTN->getBlock(), 0, DescendAndCollect, 0); + + TreeNodePtr MinNode = ToTN; + + // Identify the top of the subtree to rebuilt by finding the NCD of all + // the affected nodes. 
+ for (const NodePtr N : AffectedQueue) { + const TreeNodePtr TN = DT.getNode(N); + const NodePtr NCDBlock = + DT.findNearestCommonDominator(TN->getBlock(), ToTN->getBlock()); + assert(NCDBlock || DT.isPostDominator()); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + assert(NCD); + + DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN) + << " with NCD = " << BlockNamePrinter(NCD) + << ", MinNode =" << BlockNamePrinter(MinNode) << "\n"); + if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD; + } + + // Root reached, rebuild the whole tree from scratch. + if (!MinNode->getIDom()) { + DEBUG(dbgs() << "The entire tree needs to be rebuilt\n"); + DT.recalculate(*DT.Parent); + return; + } + + // Erase the unreachable subtree in reverse preorder to process all children + // before deleting their parent. + for (unsigned i = LastDFSNum; i > 0; --i) { + const NodePtr N = SNCA.NumToNode[i]; + const TreeNodePtr TN = DT.getNode(N); + DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n"); + + EraseNode(DT, TN); + } + + // The affected subtree start at the To node -- there's no extra work to do. + if (MinNode == ToTN) return; + + DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = " + << BlockNamePrinter(MinNode) << "\n"); + const unsigned MinLevel = MinNode->getLevel(); + const TreeNodePtr PrevIDom = MinNode->getIDom(); + assert(PrevIDom); + SNCA.clear(); + + // Identify nodes that remain in the affected subtree. + auto DescendBelow = [MinLevel, &DT](NodePtr, NodePtr To) { + const TreeNodePtr ToTN = DT.getNode(To); + return ToTN && ToTN->getLevel() > MinLevel; + }; + SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0); + + DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom) + << "\nRunning Semi-NCA\n"); + + // Rebuild the remaining part of affected subtree. + SNCA.runSemiNCA(DT, MinLevel); + SNCA.reattachExistingSubtree(DT, PrevIDom); + } + + // Removes leaf tree nodes from the dominator tree. 
+ static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) { + assert(TN); + assert(TN->getNumChildren() == 0 && "Not a tree leaf"); + + const TreeNodePtr IDom = TN->getIDom(); + assert(IDom); + + auto ChIt = llvm::find(IDom->Children, TN); + assert(ChIt != IDom->Children.end()); + std::swap(*ChIt, IDom->Children.back()); + IDom->Children.pop_back(); + + DT.DomTreeNodes.erase(TN->getBlock()); + } + + //~~ + //===--------------- DomTree correctness verification ---------------------=== + //~~ + // Check if for every parent with a level L in the tree all of its children // have level L + 1. static bool VerifyLevels(const DomTreeT &DT) { @@ -323,20 +784,18 @@ struct SemiNCAInfo { const TreeNodePtr IDom = TN->getIDom(); if (!IDom && TN->getLevel() != 0) { - errs() << "Node without an IDom "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has a nonzero level " << TN->getLevel() << "!\n"; + errs() << "Node without an IDom " << BlockNamePrinter(BB) + << " has a nonzero level " << TN->getLevel() << "!\n"; errs().flush(); return false; } if (IDom && TN->getLevel() != IDom->getLevel() + 1) { - errs() << "Node "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has level " << TN->getLevel() << " while it's IDom "; - PrintBlockOrNullptr(errs(), IDom->getBlock()); - errs() << " has level " << IDom->getLevel() << "!\n"; + errs() << "Node " << BlockNamePrinter(BB) << " has level " + << TN->getLevel() << " while its IDom " + << BlockNamePrinter(IDom->getBlock()) << " has level " + << IDom->getLevel() << "!\n"; errs().flush(); return false; @@ -363,18 +822,14 @@ struct SemiNCAInfo { assert(ToTN); const NodePtr NCD = DT.findNearestCommonDominator(From, To); - const TreeNodePtr NCDTN = NCD ? 
DT.getNode(NCD) : nullptr; + const TreeNodePtr NCDTN = DT.getNode(NCD); const TreeNodePtr ToIDom = ToTN->getIDom(); if (NCDTN != ToTN && NCDTN != ToIDom) { - errs() << "NearestCommonDominator verification failed:\n\tNCD(From:"; - PrintBlockOrNullptr(errs(), From); - errs() << ", To:"; - PrintBlockOrNullptr(errs(), To); - errs() << ") = "; - PrintBlockOrNullptr(errs(), NCD); - errs() << ",\t (should be To or IDom[To]: "; - PrintBlockOrNullptr(errs(), ToIDom ? ToIDom->getBlock() : nullptr); - errs() << ")\n"; + errs() << "NearestCommonDominator verification failed:\n\tNCD(From:" + << BlockNamePrinter(From) << ", To:" << BlockNamePrinter(To) + << ") = " << BlockNamePrinter(NCD) + << ",\t (should be To or IDom[To]: " << BlockNamePrinter(ToIDom) + << ")\n"; errs().flush(); return false; @@ -440,11 +895,9 @@ struct SemiNCAInfo { for (TreeNodePtr Child : TN->getChildren()) if (NodeToInfo.count(Child->getBlock()) != 0) { - errs() << "Child "; - PrintBlockOrNullptr(errs(), Child->getBlock()); - errs() << " reachable after its parent "; - PrintBlockOrNullptr(errs(), BB); - errs() << " is removed!\n"; + errs() << "Child " << BlockNamePrinter(Child) + << " reachable after its parent " << BlockNamePrinter(BB) + << " is removed!\n"; errs().flush(); return false; @@ -477,11 +930,9 @@ struct SemiNCAInfo { if (S == N) continue; if (NodeToInfo.count(S->getBlock()) == 0) { - errs() << "Node "; - PrintBlockOrNullptr(errs(), S->getBlock()); - errs() << " not reachable when its sibling "; - PrintBlockOrNullptr(errs(), N->getBlock()); - errs() << " is removed!\n"; + errs() << "Node " << BlockNamePrinter(S) + << " not reachable when its sibling " << BlockNamePrinter(N) + << " is removed!\n"; errs().flush(); return false; @@ -494,23 +945,30 @@ struct SemiNCAInfo { } }; -template -void Calculate(DominatorTreeBaseByGraphTraits> &DT, - FuncT &F) { - using NodePtr = typename GraphTraits::NodeRef; - static_assert(std::is_pointer::value, - "NodePtr should be a pointer type"); - 
SemiNCAInfo::type> SNCA; - SNCA.template runSemiNCA(DT, GraphTraits::size(&F)); + +template +void Calculate(DomTreeT &DT, FuncT &F) { + SemiNCAInfo SNCA; + SNCA.calculateFromScratch(DT, GraphTraits::size(&F)); } -template -bool Verify(const DominatorTreeBaseByGraphTraits> &DT) { - using NodePtr = typename GraphTraits::NodeRef; - static_assert(std::is_pointer::value, - "NodePtr should be a pointer type"); - SemiNCAInfo::type> SNCA; +template +void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To) { + if (DT.isPostDominator()) std::swap(From, To); + SemiNCAInfo::InsertEdge(DT, From, To); +} + +template +void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To) { + if (DT.isPostDominator()) std::swap(From, To); + SemiNCAInfo::DeleteEdge(DT, From, To); +} +template +bool Verify(const DomTreeT &DT) { + SemiNCAInfo SNCA; return SNCA.verifyReachability(DT) && SNCA.VerifyLevels(DT) && SNCA.verifyNCD(DT) && SNCA.verifyParentProperty(DT) && SNCA.verifySiblingProperty(DT); @@ -519,4 +977,6 @@ bool Verify(const DominatorTreeBaseByGraphTraits> &DT) { } // namespace DomTreeBuilder } // namespace llvm +#undef DEBUG_TYPE + #endif diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 72c28865ac57..e13582f6a6d3 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -85,6 +85,7 @@ enum ArchExtKind : unsigned { AEK_DSP = 0x400, AEK_FP16 = 0x800, AEK_RAS = 0x1000, + AEK_SVE = 0x2000, // Unsupported extensions. 
AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, @@ -166,7 +167,8 @@ enum ArchExtKind : unsigned { AEK_FP16 = 0x20, AEK_PROFILE = 0x40, AEK_RAS = 0x80, - AEK_LSE = 0x100 + AEK_LSE = 0x100, + AEK_SVE = 0x200 }; StringRef getCanonicalArchName(StringRef Arch); diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 15b3b11db045..71fdf47f1979 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -1114,6 +1114,10 @@ public: void *Ctxt = nullptr, SourceMgr::DiagHandlerTy DiagHandler = nullptr, void *DiagHandlerCtxt = nullptr); + Input(MemoryBufferRef Input, + void *Ctxt = nullptr, + SourceMgr::DiagHandlerTy DiagHandler = nullptr, + void *DiagHandlerCtxt = nullptr); ~Input() override; // Check if there was an syntax or semantic error during parsing. diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 178b08d7b8b7..50de41fd1320 100644 --- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -58,6 +58,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 60a03bdc182d..23711d636c9a 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2012,6 +2012,35 @@ public: return isExtFreeImpl(I); } + /// Return true if \p Load and \p Ext can form an ExtLoad. 
+ /// For example, in AArch64 + /// %L = load i8, i8* %ptr + /// %E = zext i8 %L to i32 + /// can be lowered into one load instruction + /// ldrb w0, [x0] + bool isExtLoad(const LoadInst *Load, const Instruction *Ext, + const DataLayout &DL) const { + EVT VT = getValueType(DL, Ext->getType()); + EVT LoadVT = getValueType(DL, Load->getType()); + + // If the load has other users and the truncate is not free, the ext + // probably isn't free. + if (!Load->hasOneUse() && (isTypeLegal(LoadVT) || !isTypeLegal(VT)) && + !isTruncateFree(Ext->getType(), Load->getType())) + return false; + + // Check whether the target supports casts folded into loads. + unsigned LType; + if (isa(Ext)) + LType = ISD::ZEXTLOAD; + else { + assert(isa(Ext) && "Unexpected ext type!"); + LType = ISD::SEXTLOAD; + } + + return isLoadExtLegal(LType, VT, LoadVT); + } + /// Return true if any actual instruction that defines a value of type FromTy /// implicitly zero-extends the value to ToTy in the result register. /// diff --git a/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h new file mode 100644 index 000000000000..964b0f7620a2 --- /dev/null +++ b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h @@ -0,0 +1,24 @@ +//===- DlltoolDriver.h - dlltool.exe-compatible driver ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a dlltool.exe-compatible driver. +// Used by llvm-dlltool. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H +#define LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H + +namespace llvm { +template class ArrayRef; + +int dlltoolDriverMain(ArrayRef ArgsArr); +} // namespace llvm + +#endif diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp index 3ddefc6520a7..74b5d79ebac5 100644 --- a/lib/Analysis/CGSCCPassManager.cpp +++ b/lib/Analysis/CGSCCPassManager.cpp @@ -433,7 +433,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass( if (Visited.insert(C).second) Worklist.push_back(C); - LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) { + auto VisitRef = [&](Function &Referee) { Node &RefereeN = *G.lookup(Referee); Edge *E = N->lookup(RefereeN); // FIXME: Similarly to new calls, we also currently preclude @@ -444,7 +444,12 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass( RetainedEdges.insert(&RefereeN); if (E->isCall()) DemotedCallTargets.insert(&RefereeN); - }); + }; + LazyCallGraph::visitReferences(Worklist, Visited, VisitRef); + + // Include synthetic reference edges to known, defined lib functions. + for (auto *F : G.getLibFunctions()) + VisitRef(*F); // First remove all of the edges that are no longer present in this function. 
// We have to build a list of dead targets first and then remove them as the diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp index 5b6e2d0476e4..c08c6cfe0c3b 100644 --- a/lib/Analysis/DominanceFrontier.cpp +++ b/lib/Analysis/DominanceFrontier.cpp @@ -14,7 +14,8 @@ using namespace llvm; namespace llvm { -template class DominanceFrontierBase; +template class DominanceFrontierBase; +template class DominanceFrontierBase; template class ForwardDominanceFrontierBase; } diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp index 27c6b580e7ac..95ab6ee3db5b 100644 --- a/lib/Analysis/InstCount.cpp +++ b/lib/Analysis/InstCount.cpp @@ -26,7 +26,6 @@ using namespace llvm; STATISTIC(TotalInsts , "Number of instructions (of all types)"); STATISTIC(TotalBlocks, "Number of basic blocks"); STATISTIC(TotalFuncs , "Number of non-external functions"); -STATISTIC(TotalMemInst, "Number of memory instructions"); #define HANDLE_INST(N, OPCODE, CLASS) \ STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts"); @@ -75,13 +74,6 @@ FunctionPass *llvm::createInstCountPass() { return new InstCount(); } // function. // bool InstCount::runOnFunction(Function &F) { - unsigned StartMemInsts = - NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst + - NumInvokeInst + NumAllocaInst; visit(F); - unsigned EndMemInsts = - NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst + - NumInvokeInst + NumAllocaInst; - TotalMemInst += EndMemInsts-StartMemInsts; return false; } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index f6632020b8fc..b4f3b87e1846 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1745,14 +1745,11 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Constant::getNullValue(Op0->getType()); // (A | ?) 
& A = A - Value *A = nullptr, *B = nullptr; - if (match(Op0, m_Or(m_Value(A), m_Value(B))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_c_Or(m_Specific(Op1), m_Value()))) return Op1; // A & (A | ?) = A - if (match(Op1, m_Or(m_Value(A), m_Value(B))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_c_Or(m_Specific(Op0), m_Value()))) return Op0; // A mask that only clears known zeros of a shifted value is a no-op. @@ -1852,26 +1849,22 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Constant::getAllOnesValue(Op0->getType()); // (A & ?) | A = A - Value *A = nullptr, *B = nullptr; - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) return Op1; // A | (A & ?) = A - if (match(Op1, m_And(m_Value(A), m_Value(B))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) return Op0; // ~(A & ?) | A = -1 - if (match(Op0, m_Not(m_And(m_Value(A), m_Value(B)))) && - (A == Op1 || B == Op1)) + if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op1->getType()); // A | ~(A & ?) 
= -1 - if (match(Op1, m_Not(m_And(m_Value(A), m_Value(B)))) && - (A == Op0 || B == Op0)) + if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op0->getType()); + Value *A, *B; // (A & ~B) | (A ^ B) -> (A ^ B) // (~B & A) | (A ^ B) -> (A ^ B) // (A & ~B) | (B ^ A) -> (B ^ A) diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp index 0e02850df349..3992657417c5 100644 --- a/lib/Analysis/IteratedDominanceFrontier.cpp +++ b/lib/Analysis/IteratedDominanceFrontier.cpp @@ -17,8 +17,8 @@ #include namespace llvm { -template -void IDFCalculator::calculate( +template +void IDFCalculator::calculate( SmallVectorImpl &PHIBlocks) { // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. @@ -88,6 +88,6 @@ void IDFCalculator::calculate( } } -template class IDFCalculator; -template class IDFCalculator>; +template class IDFCalculator; +template class IDFCalculator, true>; } diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp index a4c3e43b4b0c..d287f81985fd 100644 --- a/lib/Analysis/LazyCallGraph.cpp +++ b/lib/Analysis/LazyCallGraph.cpp @@ -106,6 +106,13 @@ LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() { LazyCallGraph::Edge::Ref); }); + // Add implicit reference edges to any defined libcall functions (if we + // haven't found an explicit edge). + for (auto *F : G->LibFunctions) + if (!Visited.count(F)) + addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*F), + LazyCallGraph::Edge::Ref); + return *Edges; } @@ -120,15 +127,34 @@ LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const { } #endif -LazyCallGraph::LazyCallGraph(Module &M) { +static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) { + LibFunc LF; + + // Either this is a normal library function or a "vectorizable" function. 
+ return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName()); +} + +LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) { DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier() << "\n"); - for (Function &F : M) - if (!F.isDeclaration() && !F.hasLocalLinkage()) { - DEBUG(dbgs() << " Adding '" << F.getName() - << "' to entry set of the graph.\n"); - addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); - } + for (Function &F : M) { + if (F.isDeclaration()) + continue; + // If this function is a known lib function to LLVM then we want to + // synthesize reference edges to it to model the fact that LLVM can turn + // arbitrary code into a library function call. + if (isKnownLibFunction(F, TLI)) + LibFunctions.insert(&F); + + if (F.hasLocalLinkage()) + continue; + + // External linkage defined functions have edges to them from other + // modules. + DEBUG(dbgs() << " Adding '" << F.getName() + << "' to entry set of the graph.\n"); + addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); + } // Now add entry nodes for functions reachable via initializers to globals. 
SmallVector Worklist; @@ -149,7 +175,8 @@ LazyCallGraph::LazyCallGraph(Module &M) { LazyCallGraph::LazyCallGraph(LazyCallGraph &&G) : BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)), EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)), - SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)) { + SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)), + LibFunctions(std::move(G.LibFunctions)) { updateGraphPtrs(); } @@ -160,6 +187,7 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) { SCCBPA = std::move(G.SCCBPA); SCCMap = std::move(G.SCCMap); LeafRefSCCs = std::move(G.LeafRefSCCs); + LibFunctions = std::move(G.LibFunctions); updateGraphPtrs(); return *this; } @@ -1580,6 +1608,11 @@ void LazyCallGraph::removeDeadFunction(Function &F) { assert(F.use_empty() && "This routine should only be called on trivially dead functions!"); + // We shouldn't remove library functions as they are never really dead while + // the call graph is in use -- every function definition refers to them. + assert(!isLibFunction(F) && + "Must not remove lib functions from the call graph!"); + auto NI = NodeMap.find(&F); if (NI == NodeMap.end()) // Not in the graph at all! 
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index baf932432a0a..697b58622bb4 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -609,7 +609,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { return NearLoop; } -LoopInfo::LoopInfo(const DominatorTreeBase &DomTree) { +LoopInfo::LoopInfo(const DomTreeBase &DomTree) { analyze(DomTree); } diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 86d0d92799f2..86de474c7aa9 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -39,7 +39,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Transforms/Scalar.h" #include #define DEBUG_TYPE "memoryssa" diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp index 1caf151546d9..811373ac850b 100644 --- a/lib/Analysis/PostDominators.cpp +++ b/lib/Analysis/PostDominators.cpp @@ -23,6 +23,8 @@ using namespace llvm; #define DEBUG_TYPE "postdomtree" +template class llvm::DominatorTreeBase; // PostDomTreeBase + //===----------------------------------------------------------------------===// // PostDominatorTree Implementation //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 3fb1ab980add..b973203a89b6 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -4173,6 +4173,319 @@ static Optional MatchBinaryOp(Value *V, DominatorTree &DT) { return None; } +/// Helper function to createAddRecFromPHIWithCasts. We have a phi +/// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via +/// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the +/// way. 
This function checks if \p Op, an operand of this SCEVAddExpr, +/// follows one of the following patterns: +/// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) +/// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) +/// If the SCEV expression of \p Op conforms with one of the expected patterns +/// we return the type of the truncation operation, and indicate whether the +/// truncated type should be treated as signed/unsigned by setting +/// \p Signed to true/false, respectively. +static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI, + bool &Signed, ScalarEvolution &SE) { + + // The case where Op == SymbolicPHI (that is, with no type conversions on + // the way) is handled by the regular add recurrence creating logic and + // would have already been triggered in createAddRecForPHI. Reaching it here + // means that createAddRecFromPHI had failed for this PHI before (e.g., + // because one of the other operands of the SCEVAddExpr updating this PHI is + // not invariant). + // + // Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in + // this case predicates that allow us to prove that Op == SymbolicPHI will + // be added. + if (Op == SymbolicPHI) + return nullptr; + + unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType()); + unsigned NewBits = SE.getTypeSizeInBits(Op->getType()); + if (SourceBits != NewBits) + return nullptr; + + const SCEVSignExtendExpr *SExt = dyn_cast(Op); + const SCEVZeroExtendExpr *ZExt = dyn_cast(Op); + if (!SExt && !ZExt) + return nullptr; + const SCEVTruncateExpr *Trunc = + SExt ? dyn_cast(SExt->getOperand()) + : dyn_cast(ZExt->getOperand()); + if (!Trunc) + return nullptr; + const SCEV *X = Trunc->getOperand(); + if (X != SymbolicPHI) + return nullptr; + Signed = SExt ? 
true : false; + return Trunc->getType(); +} + +static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) { + if (!PN->getType()->isIntegerTy()) + return nullptr; + const Loop *L = LI.getLoopFor(PN->getParent()); + if (!L || L->getHeader() != PN->getParent()) + return nullptr; + return L; +} + +// Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the +// computation that updates the phi follows the following pattern: +// (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum +// which correspond to a phi->trunc->sext/zext->add->phi update chain. +// If so, try to see if it can be rewritten as an AddRecExpr under some +// Predicates. If successful, return them as a pair. Also cache the results +// of the analysis. +// +// Example usage scenario: +// Say the Rewriter is called for the following SCEV: +// 8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step) +// where: +// %X = phi i64 (%Start, %BEValue) +// It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X), +// and call this function with %SymbolicPHI = %X. +// +// The analysis will find that the value coming around the backedge has +// the following SCEV: +// BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step) +// Upon concluding that this matches the desired pattern, the function +// will return the pair {NewAddRec, SmallPredsVec} where: +// NewAddRec = {%Start,+,%Step} +// SmallPredsVec = {P1, P2, P3} as follows: +// P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)} Flags: +// P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64) +// P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64) +// The returned pair means that SymbolicPHI can be rewritten into NewAddRec +// under the predicates {P1,P2,P3}. 
+// This predicated rewrite will be cached in PredicatedSCEVRewrites: +// PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3)} +// +// TODO's: +// +// 1) Extend the Induction descriptor to also support inductions that involve +// casts: When needed (namely, when we are called in the context of the +// vectorizer induction analysis), a Set of cast instructions will be +// populated by this method, and provided back to isInductionPHI. This is +// needed to allow the vectorizer to properly record them to be ignored by +// the cost model and to avoid vectorizing them (otherwise these casts, +// which are redundant under the runtime overflow checks, will be +// vectorized, which can be costly). +// +// 2) Support additional induction/PHISCEV patterns: We also want to support +// inductions where the sext-trunc / zext-trunc operations (partly) occur +// after the induction update operation (the induction increment): +// +// (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix) +// which correspond to a phi->add->trunc->sext/zext->phi update chain. +// +// (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix) +// which correspond to a phi->trunc->add->sext/zext->phi update chain. +// +// 3) Outline common code with createAddRecFromPHI to avoid duplication. +// +Optional>> +ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) { + SmallVector Predicates; + + // *** Part1: Analyze if we have a phi-with-cast pattern for which we can + // return an AddRec expression under some predicate. + + auto *PN = cast(SymbolicPHI->getValue()); + const Loop *L = isIntegerLoopHeaderPHI(PN, LI); + assert (L && "Expecting an integer loop header phi"); + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi as an addrec if it has a unique entry value and a unique + // backedge value. 
+ Value *BEValueV = nullptr, *StartValueV = nullptr; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = PN->getIncomingValue(i); + if (L->contains(PN->getIncomingBlock(i))) { + if (!BEValueV) { + BEValueV = V; + } else if (BEValueV != V) { + BEValueV = nullptr; + break; + } + } else if (!StartValueV) { + StartValueV = V; + } else if (StartValueV != V) { + StartValueV = nullptr; + break; + } + } + if (!BEValueV || !StartValueV) + return None; + + const SCEV *BEValue = getSCEV(BEValueV); + + // If the value coming around the backedge is an add with the symbolic + // value we just inserted, possibly with casts that we can ignore under + // an appropriate runtime guard, then we found a simple induction variable! + const auto *Add = dyn_cast(BEValue); + if (!Add) + return None; + + // If there is a single occurrence of the symbolic value, possibly + // casted, replace it with a recurrence. + unsigned FoundIndex = Add->getNumOperands(); + Type *TruncTy = nullptr; + bool Signed; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if ((TruncTy = + isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this))) + if (FoundIndex == e) { + FoundIndex = i; + break; + } + + if (FoundIndex == Add->getNumOperands()) + return None; + + // Create an add with everything but the specified operand. + SmallVector Ops; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (i != FoundIndex) + Ops.push_back(Add->getOperand(i)); + const SCEV *Accum = getAddExpr(Ops); + + // The runtime checks will not be valid if the step amount is + // varying inside the loop. 
+ if (!isLoopInvariant(Accum, L)) + return None; + + + // *** Part2: Create the predicates + + // Analysis was successful: we have a phi-with-cast pattern for which we + // can return an AddRec expression under the following predicates: + // + // P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum) + // fits within the truncated type (does not overflow) for i = 0 to n-1. + // P2: An Equal predicate that guarantees that + // Start = (Ext ix (Trunc iy (Start) to ix) to iy) + // P3: An Equal predicate that guarantees that + // Accum = (Ext ix (Trunc iy (Accum) to ix) to iy) + // + // As we next prove, the above predicates guarantee that: + // Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy) + // + // + // More formally, we want to prove that: + // Expr(i+1) = Start + (i+1) * Accum + // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum + // + // Given that: + // 1) Expr(0) = Start + // 2) Expr(1) = Start + Accum + // = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2 + // 3) Induction hypothesis (step i): + // Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + // + // Proof: + // Expr(i+1) = + // = Start + (i+1)*Accum + // = (Start + i*Accum) + Accum + // = Expr(i) + Accum + // = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum + // :: from step i + // + // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum + // + // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + // + (Ext ix (Trunc iy (Accum) to ix) to iy) + // + Accum :: from P3 + // + // = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy) + // + Accum :: from P1: Ext(x)+Ext(y)=>Ext(x+y) + // + // = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum + // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum + // + // By induction, the same applies to all iterations 1<=i(PHISCEV); + + SCEVWrapPredicate::IncrementWrapFlags AddedFlags = + Signed ? 
SCEVWrapPredicate::IncrementNSSW + : SCEVWrapPredicate::IncrementNUSW; + const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags); + Predicates.push_back(AddRecPred); + + // Create the Equal Predicates P2,P3: + auto AppendPredicate = [&](const SCEV *Expr) -> void { + assert (isLoopInvariant(Expr, L) && "Expr is expected to be invariant"); + const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy); + const SCEV *ExtendedExpr = + Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType()) + : getZeroExtendExpr(TruncatedExpr, Expr->getType()); + if (Expr != ExtendedExpr && + !isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) { + const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr); + DEBUG (dbgs() << "Added Predicate: " << *Pred); + Predicates.push_back(Pred); + } + }; + + AppendPredicate(StartVal); + AppendPredicate(Accum); + + // *** Part3: Predicates are ready. Now go ahead and create the new addrec in + // which the casts had been folded away. The caller can rewrite SymbolicPHI + // into NewAR if it will also add the runtime overflow checks specified in + // Predicates. + auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap); + + std::pair> PredRewrite = + std::make_pair(NewAR, Predicates); + // Remember the result of the analysis for this SCEV at this locayyytion. + PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite; + return PredRewrite; +} + +Optional>> +ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) { + + auto *PN = cast(SymbolicPHI->getValue()); + const Loop *L = isIntegerLoopHeaderPHI(PN, LI); + if (!L) + return None; + + // Check to see if we already analyzed this PHI. 
+ auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L}); + if (I != PredicatedSCEVRewrites.end()) { + std::pair> Rewrite = + I->second; + // Analysis was done before and failed to create an AddRec: + if (Rewrite.first == SymbolicPHI) + return None; + // Analysis was done before and succeeded to create an AddRec under + // a predicate: + assert(isa(Rewrite.first) && "Expected an AddRec"); + assert(!(Rewrite.second).empty() && "Expected to find Predicates"); + return Rewrite; + } + + Optional>> + Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI); + + // Record in the cache that the analysis failed + if (!Rewrite) { + SmallVector Predicates; + PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates}; + return None; + } + + return Rewrite; +} + /// A helper function for createAddRecFromPHI to handle simple cases. /// /// This function tries to find an AddRec expression for the simplest (yet most @@ -5904,6 +6217,16 @@ void ScalarEvolution::forgetLoop(const Loop *L) { RemoveLoopFromBackedgeMap(BackedgeTakenCounts); RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts); + // Drop information about predicated SCEV rewrites for this loop. + for (auto I = PredicatedSCEVRewrites.begin(); + I != PredicatedSCEVRewrites.end();) { + std::pair Entry = I->first; + if (Entry.second == L) + PredicatedSCEVRewrites.erase(I++); + else + ++I; + } + // Drop information about expressions based on loop-header PHIs. 
SmallVector Worklist; PushLoopPHIs(L, Worklist); @@ -10062,6 +10385,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) UniqueSCEVs(std::move(Arg.UniqueSCEVs)), UniquePreds(std::move(Arg.UniquePreds)), SCEVAllocator(std::move(Arg.SCEVAllocator)), + PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)), FirstUnknown(Arg.FirstUnknown) { Arg.FirstUnknown = nullptr; } @@ -10462,6 +10786,15 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { HasRecMap.erase(S); MinTrailingZerosCache.erase(S); + for (auto I = PredicatedSCEVRewrites.begin(); + I != PredicatedSCEVRewrites.end();) { + std::pair Entry = I->first; + if (Entry.first == S) + PredicatedSCEVRewrites.erase(I++); + else + ++I; + } + auto RemoveSCEVFromBackedgeMap = [S, this](DenseMap &Map) { for (auto I = Map.begin(), E = Map.end(); I != E;) { @@ -10621,10 +10954,11 @@ void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive(); } -const SCEVPredicate * -ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS, - const SCEVConstant *RHS) { +const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS, + const SCEV *RHS) { FoldingSetNodeID ID; + assert(LHS->getType() == RHS->getType() && + "Type mismatch between LHS and RHS"); // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Equal); ID.AddPointer(LHS); @@ -10687,8 +11021,7 @@ public: if (IPred->getLHS() == Expr) return IPred->getRHS(); } - - return Expr; + return convertToAddRecWithPreds(Expr); } const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { @@ -10724,17 +11057,41 @@ public: } private: - bool addOverflowAssumption(const SCEVAddRecExpr *AR, - SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { - auto *A = SE.getWrapPredicate(AR, AddedFlags); + bool addOverflowAssumption(const SCEVPredicate *P) { if (!NewPreds) { // Check if we've already made this assumption. 
- return Pred && Pred->implies(A); + return Pred && Pred->implies(P); } - NewPreds->insert(A); + NewPreds->insert(P); return true; } + bool addOverflowAssumption(const SCEVAddRecExpr *AR, + SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { + auto *A = SE.getWrapPredicate(AR, AddedFlags); + return addOverflowAssumption(A); + } + + // If \p Expr represents a PHINode, we try to see if it can be represented + // as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible + // to add this predicate as a runtime overflow check, we return the AddRec. + // If \p Expr does not meet these conditions (is not a PHI node, or we + // couldn't create an AddRec for it, or couldn't add the predicate), we just + // return \p Expr. + const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) { + if (!isa(Expr->getValue())) + return Expr; + Optional>> + PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr); + if (!PredicatedRewrite) + return Expr; + for (auto *P : PredicatedRewrite->second){ + if (!addOverflowAssumption(P)) + return Expr; + } + return PredicatedRewrite->first; + } + SmallPtrSetImpl *NewPreds; SCEVUnionPredicate *Pred; const Loop *L; @@ -10771,9 +11128,11 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, : FastID(ID), Kind(Kind) {} SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, - const SCEVUnknown *LHS, - const SCEVConstant *RHS) - : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {} + const SCEV *LHS, const SCEV *RHS) + : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) { + assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match"); + assert(LHS != RHS && "LHS and RHS are the same SCEV"); +} bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 94bbc58541a7..25813c65037f 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ 
-82,6 +82,11 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr, return TTIImpl->getGEPCost(PointeeType, Ptr, Operands); } +int TargetTransformInfo::getExtCost(const Instruction *I, + const Value *Src) const { + return TTIImpl->getExtCost(I, Src); +} + int TargetTransformInfo::getIntrinsicCost( Intrinsic::ID IID, Type *RetTy, ArrayRef Arguments) const { int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments); diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 428bb21fbf51..90e0d6a216ee 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -588,7 +588,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(spir_func); KEYWORD(intel_ocl_bicc); KEYWORD(x86_64_sysvcc); - KEYWORD(x86_64_win64cc); + KEYWORD(win64cc); KEYWORD(x86_regcallcc); KEYWORD(webkit_jscc); KEYWORD(swiftcc); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 717eb0e00f4f..13679ce1d25c 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1670,7 +1670,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'spir_func' /// ::= 'spir_kernel' /// ::= 'x86_64_sysvcc' -/// ::= 'x86_64_win64cc' +/// ::= 'win64cc' /// ::= 'webkit_jscc' /// ::= 'anyregcc' /// ::= 'preserve_mostcc' @@ -1712,7 +1712,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; - case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break; + case lltok::kw_win64cc: CC = CallingConv::Win64; break; case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break; case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; @@ -4411,13 +4411,15 @@ bool LLParser::ParseDIImportedEntity(MDNode *&Result, bool IsDistinct) { REQUIRED(tag, 
DwarfTagField, ); \ REQUIRED(scope, MDField, ); \ OPTIONAL(entity, MDField, ); \ + OPTIONAL(file, MDField, ); \ OPTIONAL(line, LineField, ); \ OPTIONAL(name, MDStringField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIImportedEntity, (Context, tag.Val, scope.Val, - entity.Val, line.Val, name.Val)); + Result = GET_OR_DISTINCT( + DIImportedEntity, + (Context, tag.Val, scope.Val, entity.Val, file.Val, line.Val, name.Val)); return false; } diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 9c7a06de81b4..0f3707ba0d1e 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -141,7 +141,7 @@ enum Kind { kw_spir_kernel, kw_spir_func, kw_x86_64_sysvcc, - kw_x86_64_win64cc, + kw_win64cc, kw_webkit_jscc, kw_anyregcc, kw_swiftcc, diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index b1504a8034e0..10fbcdea784f 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1671,15 +1671,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_IMPORTED_ENTITY: { - if (Record.size() != 6) + if (Record.size() != 6 && Record.size() != 7) return error("Invalid record"); IsDistinct = Record[0]; + bool HasFile = (Record.size() == 7); MetadataList.assignValue( GET_OR_DISTINCT(DIImportedEntity, (Context, Record[1], getMDOrNull(Record[2]), - getDITypeRefOrNull(Record[3]), Record[4], - getMDString(Record[5]))), + getDITypeRefOrNull(Record[3]), + HasFile ? getMDOrNull(Record[6]) : nullptr, + HasFile ? 
Record[4] : 0, getMDString(Record[5]))), NextMetadataNo); NextMetadataNo++; break; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 0e518d2bbc8f..dcffde1742cd 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1718,6 +1718,7 @@ void ModuleBitcodeWriter::writeDIImportedEntity( Record.push_back(VE.getMetadataOrNullID(N->getEntity())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawFile())); Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev); Record.clear(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d4a90eeabe15..676c48fe5c67 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -664,8 +664,9 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( else EntityDie = getDIE(Entity); assert(EntityDie); - addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(), - Module->getScope()->getDirectory()); + auto *File = Module->getFile(); + addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "", + File ? File->getDirectory() : ""); addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie); StringRef Name = Module->getName(); if (!Name.empty()) diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index b7155ac2480a..45dc13d58de7 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -4267,9 +4267,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. 
- Value *Consensus = nullptr; - unsigned NumUsesConsensus = 0; - bool IsNumUsesConsensusValid = false; + bool AddrModeFound = false; bool PhiSeen = false; SmallVector AddrModeInsts; ExtAddrMode AddrMode; @@ -4280,11 +4278,17 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Value *V = worklist.back(); worklist.pop_back(); - // Break use-def graph loops. - if (!Visited.insert(V).second) { - Consensus = nullptr; - break; - } + // We allow traversing cyclic Phi nodes. + // In case of success after this loop we ensure that traversing through + // Phi nodes ends up with all cases to compute address of the form + // BaseGV + Base + Scale * Index + Offset + // where Scale and Offset are constans and BaseGV, Base and Index + // are exactly the same Values in all cases. + // It means that BaseGV, Scale and Offset dominate our memory instruction + // and have the same value as they had in address computation represented + // as Phi. So we can safely sink address computation to memory instruction. + if (!Visited.insert(V).second) + continue; // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast(V)) { @@ -4297,47 +4301,26 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. Note that // the result may differ depending on what other uses our candidate // addressing instructions might have. - SmallVector NewAddrModeInsts; + AddrModeInsts.clear(); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI, - InsertedInsts, PromotedInsts, TPT); - - // This check is broken into two cases with very similar code to avoid using - // getNumUses() as much as possible. Some values have a lot of uses, so - // calling getNumUses() unconditionally caused a significant compile-time - // regression. 
- if (!Consensus) { - Consensus = V; - AddrMode = NewAddrMode; - AddrModeInsts = NewAddrModeInsts; - continue; - } else if (NewAddrMode == AddrMode) { - if (!IsNumUsesConsensusValid) { - NumUsesConsensus = Consensus->getNumUses(); - IsNumUsesConsensusValid = true; - } + V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, + InsertedInsts, PromotedInsts, TPT); - // Ensure that the obtained addressing mode is equivalent to that obtained - // for all other roots of the PHI traversal. Also, when choosing one - // such root as representative, select the one with the most uses in order - // to keep the cost modeling heuristics in AddressingModeMatcher - // applicable. - unsigned NumUses = V->getNumUses(); - if (NumUses > NumUsesConsensus) { - Consensus = V; - NumUsesConsensus = NumUses; - AddrModeInsts = NewAddrModeInsts; - } + if (!AddrModeFound) { + AddrModeFound = true; + AddrMode = NewAddrMode; continue; } + if (NewAddrMode == AddrMode) + continue; - Consensus = nullptr; + AddrModeFound = false; break; } // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. - if (!Consensus) { + if (!AddrModeFound) { TPT.rollback(LastKnownGood); return false; } @@ -4847,25 +4830,7 @@ bool CodeGenPrepare::canFormExtLd( if (!HasPromoted && LI->getParent() == Inst->getParent()) return false; - EVT VT = TLI->getValueType(*DL, Inst->getType()); - EVT LoadVT = TLI->getValueType(*DL, LI->getType()); - - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(Inst->getType(), LI->getType())) - return false; - - // Check whether the target supports casts folded into loads. 
- unsigned LType; - if (isa(Inst)) - LType = ISD::ZEXTLOAD; - else { - assert(isa(Inst) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - - return TLI->isLoadExtLegal(LType, VT, LoadVT); + return TLI->isExtLoad(LI, Inst, *DL); } /// Move a zext or sext fed by a load into the same basic block as the load, diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 49fb5e8f075b..5258370e6680 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -433,9 +433,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { } case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: { unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_SREM || MI.getOpcode() == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index c176de16b593..e6f80dbb8630 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -11,8 +11,6 @@ // instructions do not lengthen the critical path or the resource depth. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-combiner" - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" @@ -32,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "machine-combiner" + STATISTIC(NumInstCombined, "Number of machineinst combined"); namespace { diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp index 28ecc8f96805..b559e4e513a6 100644 --- a/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/lib/CodeGen/MachineDominanceFrontier.cpp @@ -15,7 +15,8 @@ using namespace llvm; namespace llvm { -template class DominanceFrontierBase; +template class DominanceFrontierBase; +template class DominanceFrontierBase; template class ForwardDominanceFrontierBase; } diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 65e9e5d195a4..845e8232477c 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -31,7 +31,7 @@ static cl::opt VerifyMachineDomInfoX( namespace llvm { template class DomTreeNodeBase; -template class DominatorTreeBase; +template class DominatorTreeBase; // DomTreeBase } char MachineDominatorTree::ID = 0; @@ -49,7 +49,7 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); - DT.reset(new DominatorTreeBase(false)); + DT.reset(new DomTreeBase()); DT->recalculate(F); return false; } @@ -144,7 +144,7 @@ void MachineDominatorTree::verifyDomTree() const { return; MachineFunction &F = *getRoot()->getParent(); - DominatorTreeBase OtherDT(false); + DomTreeBase OtherDT; OtherDT.recalculate(F); if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || DT->compare(OtherDT)) { diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp index 
c3f6e9249e7d..488377998cb3 100644 --- a/lib/CodeGen/MachinePostDominators.cpp +++ b/lib/CodeGen/MachinePostDominators.cpp @@ -16,6 +16,10 @@ using namespace llvm; +namespace llvm { +template class DominatorTreeBase; // PostDomTreeBase +} + char MachinePostDominatorTree::ID = 0; //declare initializeMachinePostDominatorTreePass @@ -24,8 +28,7 @@ INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree", MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) { initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase(true); //true indicate - // postdominator + DT = new PostDomTreeBase(); } FunctionPass * diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 71382c18fdf9..d5d3f7a61a9f 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3889,9 +3889,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. 
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { - ConstantSDNode *SubLHS = isConstOrConstSplat(N0.getOperand(0)); - SDValue SubRHS = N0.getOperand(1); - if (SubLHS && SubLHS->isNullValue()) { + if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { + SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; @@ -4586,6 +4585,20 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, return nullptr; } +// if Left + Right == Sum (constant or constant splat vector) +static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum, + SelectionDAG &DAG, const SDLoc &DL) { + EVT ShiftVT = Left.getValueType(); + if (ShiftVT != Right.getValueType()) return false; + + SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT, + Left.getNode(), Right.getNode()); + if (!ShiftSum) return false; + + ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum); + return CSum && CSum->getZExtValue() == Sum; +} + // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. @@ -4631,30 +4644,24 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { - uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != EltSizeInBits) - return nullptr; - + if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. 
if (LHSMask.getNode() || RHSMask.getNode()) { - SDValue Mask = DAG.getAllOnesConstant(DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue Mask = AllOnes; if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, LHSMask, - DAG.getConstant(RHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, RHSMask, - DAG.getConstant(LHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); @@ -5272,11 +5279,21 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x if (isNullConstantOrNullSplatConstant(N1)) return N0; + // fold (rot x, c) -> (rot x, c % BitSize) + if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { + if (Cst->getAPIntValue().uge(Bitsize)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); + return DAG.getNode(N->getOpcode(), dl, VT, N0, + DAG.getConstant(RotAmt, dl, N1.getValueType())); + } + } + // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). 
if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { @@ -5286,22 +5303,24 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) - if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) - if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) - if (SDNode *C2 = - DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { - bool SameSide = (N->getOpcode() == NextOp); - unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; - if (SDValue CombinedShift = - DAG.FoldConstantArithmetic(CombineOp, dl, VT, C1, C2)) { - unsigned Bitsize = VT.getScalarSizeInBits(); - SDValue BitsizeC = DAG.getConstant(Bitsize, dl, VT); - SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( - ISD::SREM, dl, VT, CombinedShift.getNode(), BitsizeC.getNode()); - return DAG.getNode( - N->getOpcode(), dl, VT, N0->getOperand(0), CombinedShiftNorm); - } + if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { + SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { + EVT ShiftVT = C1->getValueType(0); + bool SameSide = (N->getOpcode() == NextOp); + unsigned CombineOp = SameSide ? 
ISD::ADD : ISD::SUB; + if (SDValue CombinedShift = + DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { + SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); + SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( + ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), + BitsizeC.getNode()); + return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), + CombinedShiftNorm); } + } + } return SDValue(); } @@ -7152,8 +7171,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); } } @@ -7210,8 +7235,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! 
} } } @@ -7451,8 +7481,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -7503,8 +7539,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } @@ -7676,13 +7717,18 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. 
+ bool NoReplaceTrunc = N0.hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -11373,12 +11419,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Replace uses with load result and token factor. Don't add users - // to work list. - return CombineTo(N, ReplLoad.getValue(0), Token, false); + // Replace uses with load result and token factor + return CombineTo(N, ReplLoad.getValue(0), Token); } } @@ -12744,7 +12786,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); + unsigned Elts = i + 1; + if (MemVT.isVector()) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, @@ -13003,7 +13050,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = isDereferenceable ? + MachineMemOperand::Flags MMOFlags = isDereferenceable ? 
MachineMemOperand::MODereferenceable: MachineMemOperand::MONone; @@ -16703,6 +16750,20 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); + // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be + // able to calculate their relative offset if at least one arises + // from an alloca. However, these allocas cannot overlap and we + // can infer there is no alias. + if (auto *A = dyn_cast(BasePtr0.getBase())) + if (auto *B = dyn_cast(BasePtr1.getBase())) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // If the base are the same frame index but the we couldn't find a + // constant offset, (indices are different) be conservative. + if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || + !MFI.isFixedObjectIndex(B->getIndex()))) + return false; + } + // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis // modified to use BaseIndexOffset. diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index ac3247948169..75fec7bd1d48 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1827,10 +1827,11 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? 
ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); + if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1863,6 +1864,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); + + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { + SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + return; + } + SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1877,9 +1885,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, - DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); + + SDValue Borrow; + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) + Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); + else + Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); + Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 1a8d5a4f45da..0b4c6e551667 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -142,9 +142,9 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return false; // Invalid value for threshold. 
// Count the number of MachineInstr`s in MachineFunction - int64_t MICount = 0; - for (const auto& MBB : MF) - MICount += MBB.size(); + int64_t MICount = 0; + for (const auto& MBB : MF) + MICount += MBB.size(); // Check if we have a loop. // FIXME: Maybe make this smarter, and see whether the loops are dependent diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 22f166a2335d..79b9fdefd40e 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" @@ -42,13 +41,6 @@ static Error visitKnownMember(CVMemberRecord &Record, return Error::success(); } -static Expected deserializeTypeServerRecord(CVType &Record) { - TypeServer2Record R(TypeRecordKind::TypeServer2); - if (auto EC = TypeDeserializer::deserializeAs(Record, R)) - return std::move(EC); - return R; -} - static Error visitMemberRecord(CVMemberRecord &Record, TypeVisitorCallbacks &Callbacks) { if (auto EC = Callbacks.visitMemberBegin(Record)) @@ -84,8 +76,6 @@ class CVTypeVisitor { public: explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks); - void addTypeServerHandler(TypeServerHandler &Handler); - Error visitTypeRecord(CVType &Record, TypeIndex Index); Error visitTypeRecord(CVType &Record); @@ -98,45 +88,15 @@ public: Error visitFieldListMemberStream(BinaryStreamReader &Stream); private: - Expected handleTypeServer(CVType &Record); Error finishVisitation(CVType &Record); /// The interface to the class that gets notified of each visitation. 
TypeVisitorCallbacks &Callbacks; - - TinyPtrVector Handlers; }; CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks) : Callbacks(Callbacks) {} -void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) { - Handlers.push_back(&Handler); -} - -Expected CVTypeVisitor::handleTypeServer(CVType &Record) { - if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) { - auto TS = deserializeTypeServerRecord(Record); - if (!TS) - return TS.takeError(); - - for (auto Handler : Handlers) { - auto ExpectedResult = Handler->handle(*TS, Callbacks); - // If there was an error, return the error. - if (!ExpectedResult) - return ExpectedResult.takeError(); - - // If the handler processed the record, return success. - if (*ExpectedResult) - return true; - - // Otherwise keep searching for a handler, eventually falling out and - // using the default record handler. - } - } - return false; -} - Error CVTypeVisitor::finishVisitation(CVType &Record) { switch (Record.Type) { default: @@ -163,12 +123,6 @@ Error CVTypeVisitor::finishVisitation(CVType &Record) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record, Index)) return EC; @@ -176,12 +130,6 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record)) return EC; @@ -271,52 +219,37 @@ struct VisitHelper { Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { 
VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record, Index); } Error llvm::codeview::visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record); } Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(CVTypeRange Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(TypeCollection &Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { // When the internal visitor calls Types.getType(Index) the interface is // required to return a CVType with the bytes filled out. So we can assume // that the bytes will be present when individual records are visited. 
VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 711144fc2faa..4fc14480578e 100644 --- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -168,18 +168,19 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) { return Error::success(); } -Error CodeViewRecordIO::mapGuid(StringRef &Guid) { +Error CodeViewRecordIO::mapGuid(GUID &Guid) { constexpr uint32_t GuidSize = 16; if (maxFieldLength() < GuidSize) return make_error(cv_error_code::insufficient_buffer); if (isWriting()) { - assert(Guid.size() == 16 && "Invalid Guid Size!"); - if (auto EC = Writer->writeFixedString(Guid)) + if (auto EC = Writer->writeBytes(Guid.Guid)) return EC; } else { - if (auto EC = Reader->readFixedString(Guid, 16)) + ArrayRef GuidBytes; + if (auto EC = Reader->readBytes(GuidBytes, GuidSize)) return EC; + memcpy(Guid.Guid, GuidBytes.data(), GuidSize); } return Error::success(); } diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp index 1fa8d219d6ac..b8d89c76da3b 100644 --- a/lib/DebugInfo/CodeView/Formatters.cpp +++ b/lib/DebugInfo/CodeView/Formatters.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -39,3 +40,9 @@ void GuidAdapter::format(raw_ostream &Stream, StringRef Style) { } Stream << "}"; } + +raw_ostream &llvm::codeview::operator<<(raw_ostream &OS, const GUID &Guid) { + codeview::detail::GuidAdapter A(Guid.Guid); + A.format(OS, ""); + return OS; +} diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index c2c02f8de03f..62e73acc72d6 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ 
b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -186,7 +186,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BuildInfoSym &BuildInfo) { - W.printNumber("BuildId", BuildInfo.BuildId); + printTypeIndex("BuildId", BuildInfo.BuildId); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 589966705015..e18a35ca1f38 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -354,7 +354,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { } Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { - W->printString("Guid", formatv("{0}", fmt_guid(TS.getGuid())).str()); + W->printString("Guid", formatv("{0}", TS.getGuid()).str()); W->printNumber("Age", TS.getAge()); W->printString("Name", TS.getName()); return Error::success(); diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 71a0966df036..bff3516203a0 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -10,13 +10,11 @@ #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" @@ -57,56 +55,35 @@ namespace { /// streams: an item (or IPI) stream and a type stream, as this is what is /// actually stored in the final PDB. 
We choose which records go where by /// looking at the record kind. -class TypeStreamMerger : public TypeVisitorCallbacks { +class TypeStreamMerger { public: - explicit TypeStreamMerger(SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler) - : Handler(Handler), IndexMap(SourceToDest) { + explicit TypeStreamMerger(SmallVectorImpl &SourceToDest) + : IndexMap(SourceToDest) { SourceToDest.clear(); } static const TypeIndex Untranslated; - Error visitTypeBegin(CVType &Record) override; - Error visitTypeEnd(CVType &Record) override; - Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, - const CVTypeArray &Ids); + const CVTypeArray &Ids); Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types); private: Error doit(const CVTypeArray &Types); + Error remapAllTypes(const CVTypeArray &Types); + + Error remapType(const CVType &Type); + void addMapping(TypeIndex Idx); bool remapTypeIndex(TypeIndex &Idx); bool remapItemIndex(TypeIndex &Idx); - bool remapIndices(RemappedType &Record, ArrayRef Refs) { - auto OriginalData = Record.OriginalRecord.content(); - bool Success = true; - for (auto &Ref : Refs) { - uint32_t Offset = Ref.Offset; - ArrayRef Bytes = - OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); - ArrayRef TIs(reinterpret_cast(Bytes.data()), - Ref.Count); - for (auto TI : TIs) { - TypeIndex NewTI = TI; - bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) - ? 
remapItemIndex(NewTI) - : remapTypeIndex(NewTI); - if (ThisSuccess && NewTI != TI) - Record.Mappings.emplace_back(Offset, NewTI); - Offset += sizeof(TypeIndex); - Success &= ThisSuccess; - } - } - return Success; - } + bool remapIndices(RemappedType &Record, ArrayRef Refs); bool remapIndex(TypeIndex &Idx, ArrayRef Map); @@ -128,21 +105,6 @@ private: return Error::success(); } - Error writeTypeRecord(const CVType &Record) { - TypeIndex DestIdx = - DestTypeStream->writeSerializedRecord(Record.RecordData); - addMapping(DestIdx); - return Error::success(); - } - - Error writeTypeRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestTypeStream, Record, RemapSuccess); - } - - Error writeIdRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestIdStream, Record, RemapSuccess); - } - Optional LastError; bool IsSecondPass = false; @@ -153,7 +115,6 @@ private: TypeTableBuilder *DestIdStream = nullptr; TypeTableBuilder *DestTypeStream = nullptr; - TypeServerHandler *Handler = nullptr; // If we're only mapping id records, this array contains the mapping for // type records. 
@@ -168,12 +129,8 @@ private: const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated); -Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { - RemappedType R(Rec); - SmallVector Refs; - discoverTypeIndices(Rec.RecordData, Refs); - bool Success = remapIndices(R, Refs); - switch (Rec.kind()) { +static bool isIdRecord(TypeLeafKind K) { + switch (K) { case TypeLeafKind::LF_FUNC_ID: case TypeLeafKind::LF_MFUNC_ID: case TypeLeafKind::LF_STRING_ID: @@ -181,19 +138,10 @@ Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { case TypeLeafKind::LF_BUILDINFO: case TypeLeafKind::LF_UDT_SRC_LINE: case TypeLeafKind::LF_UDT_MOD_SRC_LINE: - return writeIdRecord(R, Success); + return true; default: - return writeTypeRecord(R, Success); + return false; } - return Error::success(); -} - -Error TypeStreamMerger::visitTypeEnd(CVType &Rec) { - ++CurIndex; - if (!IsSecondPass) - assert(IndexMap.size() == slotForIndex(CurIndex) && - "visitKnownRecord should add one index map entry"); - return Error::success(); } void TypeStreamMerger::addMapping(TypeIndex Idx) { @@ -256,7 +204,7 @@ bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) { } Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, - const CVTypeArray &Types) { + const CVTypeArray &Types) { DestTypeStream = &Dest; return doit(Types); @@ -264,7 +212,7 @@ Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, - const CVTypeArray &Ids) { + const CVTypeArray &Ids) { DestIdStream = &Dest; TypeLookup = TypeSourceToDest; @@ -273,25 +221,14 @@ Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes) { + const CVTypeArray &IdsAndTypes) { DestIdStream = &DestIds; DestTypeStream = &DestTypes; - return doit(IdsAndTypes); } Error TypeStreamMerger::doit(const CVTypeArray 
&Types) { - LastError = Error::success(); - - // We don't want to deserialize records. I guess this flag is poorly named, - // but it really means "Don't deserialize records before switching on the - // concrete type. - // FIXME: We can probably get even more speed here if we don't use the visitor - // pipeline here, but instead write the switch ourselves. I don't think it - // would buy us much since it's already pretty fast, but it's probably worth - // a few cycles. - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; // If we found bad indices but no other errors, try doing another pass and see @@ -301,50 +238,92 @@ Error TypeStreamMerger::doit(const CVTypeArray &Types) { // topologically sorted. The standard library contains MASM-produced objects, // so this is important to handle correctly, but we don't have to be too // efficient. MASM type streams are usually very small. - while (!*LastError && NumBadIndices > 0) { + while (!LastError && NumBadIndices > 0) { unsigned BadIndicesRemaining = NumBadIndices; IsSecondPass = true; NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; assert(NumBadIndices <= BadIndicesRemaining && "second pass found more bad indices"); - if (!*LastError && NumBadIndices == BadIndicesRemaining) { + if (!LastError && NumBadIndices == BadIndicesRemaining) { return llvm::make_error( cv_error_code::corrupt_record, "input type graph contains cycles"); } } - Error Ret = std::move(*LastError); - LastError.reset(); - return Ret; + if (LastError) + return std::move(*LastError); + return Error::success(); +} + +Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) { + for (const CVType &Type : Types) + if (auto EC = remapType(Type)) + return EC; + return Error::success(); +} + +Error 
TypeStreamMerger::remapType(const CVType &Type) { + RemappedType R(Type); + SmallVector Refs; + discoverTypeIndices(Type.RecordData, Refs); + bool MappedAllIndices = remapIndices(R, Refs); + TypeTableBuilder &Dest = + isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream; + if (auto EC = writeRecord(Dest, R, MappedAllIndices)) + return EC; + + ++CurIndex; + assert((IsSecondPass || IndexMap.size() == slotForIndex(CurIndex)) && + "visitKnownRecord should add one index map entry"); + return Error::success(); +} + +bool TypeStreamMerger::remapIndices(RemappedType &Record, + ArrayRef Refs) { + ArrayRef OriginalData = Record.OriginalRecord.content(); + bool Success = true; + for (auto &Ref : Refs) { + uint32_t Offset = Ref.Offset; + ArrayRef Bytes = OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); + ArrayRef TIs(reinterpret_cast(Bytes.data()), + Ref.Count); + for (auto TI : TIs) { + TypeIndex NewTI = TI; + bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) + ? remapItemIndex(NewTI) + : remapTypeIndex(NewTI); + if (ThisSuccess && NewTI != TI) + Record.Mappings.emplace_back(Offset, NewTI); + Offset += sizeof(TypeIndex); + Success &= ThisSuccess; + } + } + return Success; } Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &Types) { - TypeStreamMerger M(SourceToDest, Handler); + const CVTypeArray &Types) { + TypeStreamMerger M(SourceToDest); return M.mergeTypeRecords(Dest, Types); } Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, SmallVectorImpl &SourceToDest, - const CVTypeArray &Ids) { - TypeStreamMerger M(SourceToDest, nullptr); + const CVTypeArray &Ids) { + TypeStreamMerger M(SourceToDest); return M.mergeIdRecords(Dest, TypeSourceToDest, Ids); } Error llvm::codeview::mergeTypeAndIdRecords( TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, - const CVTypeArray 
&IdsAndTypes) { - - TypeStreamMerger M(SourceToDest, Handler); + SmallVectorImpl &SourceToDest, const CVTypeArray &IdsAndTypes) { + TypeStreamMerger M(SourceToDest); return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes); } diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 0a10e6b78911..6cf44ffa3796 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -24,22 +24,166 @@ using namespace llvm; using namespace dwarf; using namespace object; -void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, + uint8_t &UnitType, bool &isUnitDWARF64) { + uint32_t AbbrOffset, Length; + uint8_t AddrSize = 0; + uint16_t Version; + bool Success = true; + + bool ValidLength = false; + bool ValidVersion = false; + bool ValidAddrSize = false; + bool ValidType = true; + bool ValidAbbrevOffset = true; + + uint32_t OffsetStart = *Offset; + Length = DebugInfoData.getU32(Offset); + if (Length == UINT32_MAX) { + isUnitDWARF64 = true; + OS << format( + "Unit[%d] is in 64-bit DWARF format; cannot verify from this point.\n", + UnitIndex); + return false; + } + Version = DebugInfoData.getU16(Offset); + + if (Version >= 5) { + UnitType = DebugInfoData.getU8(Offset); + AddrSize = DebugInfoData.getU8(Offset); + AbbrOffset = DebugInfoData.getU32(Offset); + ValidType = DWARFUnit::isValidUnitType(UnitType); + } else { + UnitType = 0; + AbbrOffset = DebugInfoData.getU32(Offset); + AddrSize = DebugInfoData.getU8(Offset); + } + + if (!DCtx.getDebugAbbrev()->getAbbreviationDeclarationSet(AbbrOffset)) + ValidAbbrevOffset = false; + + ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3); + ValidVersion = DWARFContext::isSupportedVersion(Version); + ValidAddrSize = AddrSize == 4 || AddrSize == 8; + if (!ValidLength || !ValidVersion || 
!ValidAddrSize || !ValidAbbrevOffset || + !ValidType) { + Success = false; + OS << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, OffsetStart); + if (!ValidLength) + OS << "\tError: The length for this unit is too " + "large for the .debug_info provided.\n"; + if (!ValidVersion) + OS << "\tError: The 16 bit unit header version is not valid.\n"; + if (!ValidType) + OS << "\tError: The unit type encoding is not valid.\n"; + if (!ValidAbbrevOffset) + OS << "\tError: The offset into the .debug_abbrev section is " + "not valid.\n"; + if (!ValidAddrSize) + OS << "\tError: The address size is unsupported.\n"; + } + *Offset = OffsetStart + Length + 4; + return Success; +} + +bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit) { + uint32_t NumUnitErrors = 0; + unsigned NumDies = Unit.getNumDIEs(); + for (unsigned I = 0; I < NumDies; ++I) { + auto Die = Unit.getDIEAtIndex(I); + if (Die.getTag() == DW_TAG_null) + continue; + for (auto AttrValue : Die.attributes()) { + NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue); + NumUnitErrors += verifyDebugInfoForm(Die, AttrValue); + } + } + return NumUnitErrors == 0; +} + +bool DWARFVerifier::handleDebugInfo() { + OS << "Verifying .debug_info Unit Header Chain...\n"; + + DWARFDataExtractor DebugInfoData(DCtx.getInfoSection(), DCtx.isLittleEndian(), + 0); + uint32_t NumDebugInfoErrors = 0; + uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint8_t UnitType = 0; + bool isUnitDWARF64 = false; + bool isHeaderChainValid = true; + bool hasDIE = DebugInfoData.isValidOffset(Offset); + while (hasDIE) { + OffsetStart = Offset; + if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, + isUnitDWARF64)) { + isHeaderChainValid = false; + if (isUnitDWARF64) + break; + } else { + std::unique_ptr Unit; + switch (UnitType) { + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: { + DWARFUnitSection TUSection{}; + Unit.reset(new DWARFTypeUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + 
&DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, TUSection, + nullptr)); + break; + } + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + // UnitType = 0 means that we are + // verifying a compile unit in DWARF v4. + case 0: { + DWARFUnitSection CUSection{}; + Unit.reset(new DWARFCompileUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + &DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, CUSection, + nullptr)); + break; + } + default: { llvm_unreachable("Invalid UnitType."); } + } + Unit->extract(DebugInfoData, &OffsetStart); + if (!verifyUnitContents(*Unit)) + ++NumDebugInfoErrors; + } + hasDIE = DebugInfoData.isValidOffset(Offset); + ++UnitIdx; + } + if (UnitIdx == 0 && !hasDIE) { + OS << "Warning: .debug_info is empty.\n"; + isHeaderChainValid = true; + } + NumDebugInfoErrors += verifyDebugInfoReferences(); + return (isHeaderChainValid && NumDebugInfoErrors == 0); +} + +unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Attr = AttrValue.Attr; switch (Attr) { case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. 
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getRangeSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_ranges offset is beyond .debug_ranges " "bounds:\n"; Die.dump(OS, 0); OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_ranges encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -49,7 +193,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, // Make sure the offset in the DW_AT_stmt_list attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getLineSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_stmt_list offset is beyond .debug_line " "bounds: " << format("0x%08" PRIx32, *SectionOffset) << "\n"; @@ -57,7 +201,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_stmt_list encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -67,10 +211,12 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Form = AttrValue.Value.getForm(); switch (Form) { case DW_FORM_ref1: @@ -86,7 +232,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset(); auto CUOffset = AttrValue.Value.getRawUValue(); if (CUOffset >= CUSize) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: " << FormEncodingString(Form) << " CU offset " << format("0x%08" PRIx32, CUOffset) << " is invalid (must be less than CU size of " @@ -108,7 +254,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, assert(RefVal); if 
(RefVal) { if (*RefVal >= DCtx.getInfoSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_ref_addr offset beyond .debug_info " "bounds:\n"; Die.dump(OS, 0); @@ -125,7 +271,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto SecOffset = AttrValue.Value.getAsSectionOffset(); assert(SecOffset); // DW_FORM_strp is a section offset. if (SecOffset && *SecOffset >= DCtx.getStringSection().size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_strp offset beyond .debug_str bounds:\n"; Die.dump(OS, 0); OS << "\n"; @@ -135,17 +281,19 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoReferences() { +unsigned DWARFVerifier::verifyDebugInfoReferences() { // Take all references and make sure they point to an actual DIE by // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; + unsigned NumErrors = 0; for (auto Pair : ReferenceToDIEOffsets) { auto Die = DCtx.getDIEForOffset(Pair.first); if (Die) continue; - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: invalid DIE reference " << format("0x%08" PRIx64, Pair.first) << ". 
Offset is in between DIEs:\n"; for (auto Offset : Pair.second) { @@ -155,26 +303,7 @@ void DWARFVerifier::verifyDebugInfoReferences() { } OS << "\n"; } -} - -bool DWARFVerifier::handleDebugInfo() { - NumDebugInfoErrors = 0; - OS << "Verifying .debug_info...\n"; - for (const auto &CU : DCtx.compile_units()) { - unsigned NumDies = CU->getNumDIEs(); - for (unsigned I = 0; I < NumDies; ++I) { - auto Die = CU->getDIEAtIndex(I); - const auto Tag = Die.getTag(); - if (Tag == DW_TAG_null) - continue; - for (auto AttrValue : Die.attributes()) { - verifyDebugInfoAttribute(Die, AttrValue); - verifyDebugInfoForm(Die, AttrValue); - } - } - } - verifyDebugInfoReferences(); - return NumDebugInfoErrors == 0; + return NumErrors; } void DWARFVerifier::verifyDebugLineStmtOffsets() { diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt index ff01c948e099..9b1f37943e67 100644 --- a/lib/DebugInfo/PDB/CMakeLists.txt +++ b/lib/DebugInfo/PDB/CMakeLists.txt @@ -52,7 +52,6 @@ add_pdb_impl_folder(Native Native/PDBFileBuilder.cpp Native/PDBStringTable.cpp Native/PDBStringTableBuilder.cpp - Native/PDBTypeServerHandler.cpp Native/PublicsStream.cpp Native/PublicsStreamBuilder.cpp Native/RawError.cpp diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp index 0b48a366bd24..4c59d2f2a9d9 100644 --- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp +++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp @@ -125,16 +125,16 @@ PrivateGetDIAValue(IDiaSymbol *Symbol, return Result8; } -PDB_UniqueId +codeview::GUID PrivateGetDIAValue(IDiaSymbol *Symbol, HRESULT (__stdcall IDiaSymbol::*Method)(GUID *)) { GUID Result; if (S_OK != (Symbol->*Method)(&Result)) - return PDB_UniqueId(); + return codeview::GUID(); - static_assert(sizeof(PDB_UniqueId) == sizeof(GUID), - "PDB_UniqueId is the wrong size!"); - PDB_UniqueId IdResult; + static_assert(sizeof(codeview::GUID) == sizeof(GUID), + "GUID is the wrong size!"); + codeview::GUID IdResult; ::memcpy(&IdResult, 
&Result, sizeof(GUID)); return IdResult; } @@ -746,7 +746,7 @@ PDB_SymType DIARawSymbol::getSymTag() const { &IDiaSymbol::get_symTag); } -PDB_UniqueId DIARawSymbol::getGuid() const { +codeview::GUID DIARawSymbol::getGuid() const { return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_guid); } diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 789f3b813170..4fcecb92fd15 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ b/lib/DebugInfo/PDB/GenericError.cpp @@ -26,6 +26,8 @@ public: switch (static_cast(Condition)) { case generic_error_code::unspecified: return "An unknown error has occurred."; + case generic_error_code::type_server_not_found: + return "Type server PDB was not found."; case generic_error_code::dia_sdk_not_present: return "LLVM was not compiled with support for DIA. This usually means " "that you are are not using MSVC, or your Visual Studio " diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index 21b66b3e7bcf..829879060c33 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -118,7 +118,7 @@ uint32_t InfoStream::getSignature() const { return Signature; } uint32_t InfoStream::getAge() const { return Age; } -PDB_UniqueId InfoStream::getGuid() const { return Guid; } +GUID InfoStream::getGuid() const { return Guid; } uint32_t InfoStream::getNamedStreamMapByteSize() const { return NamedStreamMapByteSize; diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp index 707128f7efd4..6450ae752f96 100644 --- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp @@ -34,7 +34,7 @@ void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; } void InfoStreamBuilder::setAge(uint32_t A) { Age = A; } -void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; } +void InfoStreamBuilder::setGuid(GUID G) { Guid = G; } void 
InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) { Features.push_back(Sig); diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp index cb0830f453c8..3241000b06db 100644 --- a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp @@ -56,12 +56,12 @@ std::string NativeExeSymbol::getSymbolsFileName() const { return File.getFilePath(); } -PDB_UniqueId NativeExeSymbol::getGuid() const { +codeview::GUID NativeExeSymbol::getGuid() const { auto IS = File.getPDBInfoStream(); if (IS) return IS->getGuid(); consumeError(IS.takeError()); - return PDB_UniqueId{{0}}; + return codeview::GUID{{0}}; } bool NativeExeSymbol::hasCTypes() const { diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index 92612bcea4ac..df3f418052a9 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -323,9 +323,7 @@ PDB_SymType NativeRawSymbol::getSymTag() const { return PDB_SymType::None; } -PDB_UniqueId NativeRawSymbol::getGuid() const { - return PDB_UniqueId{{0}}; -} +codeview::GUID NativeRawSymbol::getGuid() const { return codeview::GUID{{0}}; } int32_t NativeRawSymbol::getOffset() const { return 0; diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp deleted file mode 100644 index 9fd90102f72c..000000000000 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//===- PDBTypeServerHandler.cpp ---------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// Handles CodeView LF_TYPESERVER2 records by attempting to locate a matching -// PDB file, then loading the PDB file and visiting all types from the -// referenced PDB using the original supplied visitor. -// -// The net effect of this is that when visiting a PDB containing a TypeServer -// record, the TypeServer record is "replaced" with all of the records in -// the referenced PDB file. If a single instance of PDBTypeServerHandler -// encounters the same TypeServer multiple times (for example reusing one -// PDBTypeServerHandler across multiple visitations of distinct object files or -// PDB files), PDBTypeServerHandler will optionally revisit all the records -// again, or simply consume the record and do nothing. -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/PDB/GenericError.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/TpiStream.h" -#include "llvm/DebugInfo/PDB/PDB.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -static void ignoreErrors(Error EC) { - llvm::handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {}); -} - -PDBTypeServerHandler::PDBTypeServerHandler(bool RevisitAlways) - : RevisitAlways(RevisitAlways) {} - -void PDBTypeServerHandler::addSearchPath(StringRef Path) { - if (Path.empty() || !sys::fs::is_directory(Path)) - return; - - SearchPaths.insert(Path); -} - -Expected -PDBTypeServerHandler::handleInternal(PDBFile &File, - TypeVisitorCallbacks &Callbacks) { - auto ExpectedTpi = 
File.getPDBTpiStream(); - if (!ExpectedTpi) - return ExpectedTpi.takeError(); - - // For handling a type server, we should be using whatever the callback array - // was - // that is being used for the original file. We shouldn't allow the visitor - // to - // arbitrarily stick a deserializer in there. - if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks, - VDS_BytesExternal)) - return std::move(EC); - - return true; -} - -Expected PDBTypeServerHandler::handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - if (Session) { - // If we've already handled this TypeServer and we only want to handle each - // TypeServer once, consume the record without doing anything. - if (!RevisitAlways) - return true; - - return handleInternal(Session->getPDBFile(), Callbacks); - } - - StringRef File = sys::path::filename(TS.Name); - if (File.empty()) - return make_error( - cv_error_code::corrupt_record, - "TypeServer2Record does not contain filename!"); - - for (auto &Path : SearchPaths) { - SmallString<64> PathStr = Path.getKey(); - sys::path::append(PathStr, File); - if (!sys::fs::exists(PathStr)) - continue; - - std::unique_ptr ThisSession; - if (auto EC = loadDataForPDB(PDB_ReaderType::Native, PathStr, ThisSession)) { - // It is not an error if this PDB fails to load, it just means that it - // doesn't match and we should continue searching. - ignoreErrors(std::move(EC)); - continue; - } - - std::unique_ptr NS( - static_cast(ThisSession.release())); - PDBFile &File = NS->getPDBFile(); - auto ExpectedInfo = File.getPDBInfoStream(); - // All PDB Files should have an Info stream. - if (!ExpectedInfo) - return ExpectedInfo.takeError(); - - // Just because a file with a matching name was found and it was an actual - // PDB file doesn't mean it matches. For it to match the InfoStream's GUID - // must match the GUID specified in the TypeServer2 record. 
- ArrayRef GuidBytes(ExpectedInfo->getGuid().Guid); - StringRef GuidStr(reinterpret_cast(GuidBytes.begin()), - GuidBytes.size()); - if (GuidStr != TS.Guid) - continue; - - Session = std::move(NS); - return handleInternal(File, Callbacks); - } - - // We couldn't find a matching PDB, so let it be handled by someone else. - return false; -} diff --git a/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/lib/DebugInfo/PDB/Native/TpiHashing.cpp index 91b8d648fcf9..77a2d57a8369 100644 --- a/lib/DebugInfo/PDB/Native/TpiHashing.cpp +++ b/lib/DebugInfo/PDB/Native/TpiHashing.cpp @@ -11,101 +11,79 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/Support/JamCRC.h" using namespace llvm; using namespace llvm::codeview; using namespace llvm::pdb; // Corresponds to `fUDTAnon`. -template static bool isAnonymous(T &Rec) { - StringRef Name = Rec.getName(); +static bool isAnonymous(StringRef Name) { return Name == "" || Name == "__unnamed" || Name.endswith("::") || Name.endswith("::__unnamed"); } -// Computes a hash for a given TPI record. -template -static uint32_t getTpiHash(T &Rec, ArrayRef FullRecord) { - auto Opts = static_cast(Rec.getOptions()); - - bool ForwardRef = - Opts & static_cast(ClassOptions::ForwardReference); - bool Scoped = Opts & static_cast(ClassOptions::Scoped); - bool UniqueName = Opts & static_cast(ClassOptions::HasUniqueName); - bool IsAnon = UniqueName && isAnonymous(Rec); +// Computes the hash for a user-defined type record. This could be a struct, +// class, union, or enum. 
+static uint32_t getHashForUdt(const TagRecord &Rec, + ArrayRef FullRecord) { + ClassOptions Opts = Rec.getOptions(); + bool ForwardRef = bool(Opts & ClassOptions::ForwardReference); + bool Scoped = bool(Opts & ClassOptions::Scoped); + bool HasUniqueName = bool(Opts & ClassOptions::HasUniqueName); + bool IsAnon = HasUniqueName && isAnonymous(Rec.getName()); if (!ForwardRef && !Scoped && !IsAnon) return hashStringV1(Rec.getName()); - if (!ForwardRef && UniqueName && !IsAnon) + if (!ForwardRef && HasUniqueName && !IsAnon) return hashStringV1(Rec.getUniqueName()); return hashBufferV8(FullRecord); } -template static uint32_t getSourceLineHash(T &Rec) { - char Buf[4]; - support::endian::write32le(Buf, Rec.getUDT().getIndex()); - return hashStringV1(StringRef(Buf, 4)); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtModSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, ClassRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, EnumRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, UnionRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UdtSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, - UdtModSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, ClassRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, EnumRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != 
HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UnionRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); +template +static Expected getHashForUdt(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast(Rec), + Deserialized)) + return std::move(E); + return getHashForUdt(Deserialized, Rec.data()); } -Error TpiHashVerifier::verifySourceLine(codeview::TypeIndex TI) { +template +static Expected getSourceLineHash(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast(Rec), + Deserialized)) + return std::move(E); char Buf[4]; - support::endian::write32le(Buf, TI.getIndex()); - uint32_t Hash = hashStringV1(StringRef(Buf, 4)); - if (Hash % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); + support::endian::write32le(Buf, Deserialized.getUDT().getIndex()); + return hashStringV1(StringRef(Buf, 4)); } -Error TpiHashVerifier::visitTypeBegin(CVType &Rec) { - ++Index; - RawRecord = Rec; - return Error::success(); +Expected llvm::pdb::hashTypeRecord(const CVType &Rec) { + switch (Rec.kind()) { + case LF_CLASS: + case LF_STRUCTURE: + case LF_INTERFACE: + return getHashForUdt(Rec); + case LF_UNION: + return getHashForUdt(Rec); + case LF_ENUM: + return getHashForUdt(Rec); + + case LF_UDT_SRC_LINE: + return getSourceLineHash(Rec); + case LF_UDT_MOD_SRC_LINE: + return getSourceLineHash(Rec); + + default: + break; + } + + // Run CRC32 over the bytes. This corresponds to `hashBufv8`. 
+ JamCRC JC(/*Init=*/0U); + ArrayRef Bytes(reinterpret_cast(Rec.data().data()), + Rec.data().size()); + JC.update(Bytes); + return JC.getCRC(); } diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index f917ef91f639..d3ef87d9009d 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp index faf1142ddf17..c291185bc67a 100644 --- a/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/lib/DebugInfo/PDB/PDBExtras.cpp @@ -260,12 +260,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, return OS; } -raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Guid) { - codeview::detail::GuidAdapter A(Guid.Guid); - A.format(OS, ""); - return OS; -} - raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) { switch (Type) { CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS) diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index adca0eeb08b4..43461de4c491 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -288,7 +288,6 @@ private: HalfDiffKindBits); addRelocationForSection(R, SectionAID); - addRelocationForSection(R, SectionBID); return ++RelI; } diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt index fa743c280e86..bc744890b997 100644 --- a/lib/Fuzzer/CMakeLists.txt +++ b/lib/Fuzzer/CMakeLists.txt @@ -46,7 +46,6 @@ if ( 
LLVM_USE_SANITIZE_COVERAGE OR CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux" ) FuzzerShmemPosix.cpp FuzzerShmemWindows.cpp FuzzerTracePC.cpp - FuzzerTraceState.cpp FuzzerUtil.cpp FuzzerUtilDarwin.cpp FuzzerUtilLinux.cpp diff --git a/lib/Fuzzer/FuzzerCorpus.h b/lib/Fuzzer/FuzzerCorpus.h index 218ae5b6ac4d..bae0aea78f13 100644 --- a/lib/Fuzzer/FuzzerCorpus.h +++ b/lib/Fuzzer/FuzzerCorpus.h @@ -34,7 +34,8 @@ struct InputInfo { size_t NumExecutedMutations = 0; size_t NumSuccessfullMutations = 0; bool MayDeleteFile = false; - std::vector FeatureSet; + bool Reduced = false; + std::vector UniqFeatureSet; }; class InputCorpus { @@ -79,7 +80,8 @@ class InputCorpus { II.U = U; II.NumFeatures = NumFeatures; II.MayDeleteFile = MayDeleteFile; - II.FeatureSet = FeatureSet; + II.UniqFeatureSet = FeatureSet; + std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); ComputeSHA1(U.data(), U.size(), II.Sha1); Hashes.insert(Sha1ToString(II.Sha1)); UpdateCorpusDistribution(); @@ -117,34 +119,21 @@ class InputCorpus { Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); PrintUnit(II->U); Printf(" "); - PrintFeatureSet(II->FeatureSet); + PrintFeatureSet(II->UniqFeatureSet); Printf("\n"); } i++; } } - // If FeatureSet is that same as in II, replace II->U with {Data,Size}. 
- bool TryToReplace(InputInfo *II, const uint8_t *Data, size_t Size, - const std::vector &FeatureSet) { - if (II->U.size() > Size && II->FeatureSet.size() && - II->FeatureSet == FeatureSet) { - if (FeatureDebug) - Printf("Replace: %zd => %zd\n", II->U.size(), Size); - Replace(II, {Data, Data + Size}); - PrintCorpus(); - return true; - } - return false; - } - void Replace(InputInfo *II, const Unit &U) { - assert(II->U.size()); + assert(II->U.size() > U.size()); Hashes.erase(Sha1ToString(II->Sha1)); DeleteFile(*II); ComputeSHA1(U.data(), U.size(), II->Sha1); Hashes.insert(Sha1ToString(II->Sha1)); II->U = U; + II->Reduced = true; } bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } @@ -198,7 +187,7 @@ class InputCorpus { Printf("EVICTED %zd\n", Idx); } - void AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { + bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { assert(NewSize); Idx = Idx % kFeatureSetSize; uint32_t OldSize = GetFeature(Idx); @@ -218,8 +207,9 @@ class InputCorpus { Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); SmallestElementPerFeature[Idx] = Inputs.size(); InputSizesPerFeature[Idx] = NewSize; - CountingFeatures = true; + return true; } + return false; } size_t NumFeatures() const { return NumAddedFeatures; } @@ -238,7 +228,6 @@ private: size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } void ValidateFeatureSet() { - if (!CountingFeatures) return; if (FeatureDebug) PrintFeatureSet(); for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) @@ -256,14 +245,12 @@ private: // Must be called whenever the corpus or unit weights are changed. 
void UpdateCorpusDistribution() { size_t N = Inputs.size(); + assert(N); Intervals.resize(N + 1); Weights.resize(N); std::iota(Intervals.begin(), Intervals.end(), 0); - if (CountingFeatures) - for (size_t i = 0; i < N; i++) - Weights[i] = Inputs[i]->NumFeatures * (i + 1); - else - std::iota(Weights.begin(), Weights.end(), 1); + for (size_t i = 0; i < N; i++) + Weights[i] = Inputs[i]->NumFeatures * (i + 1); CorpusDistribution = std::piecewise_constant_distribution( Intervals.begin(), Intervals.end(), Weights.begin()); } @@ -275,7 +262,6 @@ private: std::unordered_set Hashes; std::vector Inputs; - bool CountingFeatures = false; size_t NumAddedFeatures = 0; size_t NumUpdatedFeatures = 0; uint32_t InputSizesPerFeature[kFeatureSetSize]; diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index 87968893853e..fd8cab38a7bb 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -186,7 +186,11 @@ static void ParseFlags(const std::vector &Args) { } Inputs = new std::vector; for (size_t A = 1; A < Args.size(); A++) { - if (ParseOneFlag(Args[A].c_str())) continue; + if (ParseOneFlag(Args[A].c_str())) { + if (Flags.ignore_remaining_args) + break; + continue; + } Inputs->push_back(Args[A]); } } @@ -356,16 +360,17 @@ int MinimizeCrashInput(const std::vector &Args, exit(1); } std::string InputFilePath = Inputs->at(0); - std::string BaseCmd = - CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path"); - auto InputPos = BaseCmd.find(" " + InputFilePath + " "); + auto BaseCmd = SplitBefore( + "-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path")); + auto InputPos = BaseCmd.first.find(" " + InputFilePath + " "); assert(InputPos != std::string::npos); - BaseCmd.erase(InputPos, InputFilePath.size() + 1); + BaseCmd.first.erase(InputPos, InputFilePath.size() + 1); if (Flags.runs <= 0 && Flags.max_total_time == 0) { Printf("INFO: you need to specify -runs=N or " "-max_total_time=N with 
-minimize_crash=1\n" "INFO: defaulting to -max_total_time=600\n"); - BaseCmd += " -max_total_time=600"; + BaseCmd.first += " -max_total_time=600"; } auto LogFilePath = DirPlusFile( @@ -378,7 +383,8 @@ int MinimizeCrashInput(const std::vector &Args, Printf("CRASH_MIN: minimizing crash input: '%s' (%zd bytes)\n", CurrentFilePath.c_str(), U.size()); - auto Cmd = BaseCmd + " " + CurrentFilePath + LogFileRedirect; + auto Cmd = BaseCmd.first + " " + CurrentFilePath + LogFileRedirect + " " + + BaseCmd.second; Printf("CRASH_MIN: executing: %s\n", Cmd.c_str()); int ExitCode = ExecuteCommand(Cmd); diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 5e70cbad3cf1..526805705b20 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -121,6 +121,9 @@ FUZZER_FLAG_STRING(exit_on_src_pos, "Exit if a newly found PC originates" FUZZER_FLAG_STRING(exit_on_item, "Exit if an item with a given sha1 sum" " was added to the corpus. " "Used primarily for testing libFuzzer itself.") +FUZZER_FLAG_INT(ignore_remaining_args, 0, "If 1, ignore all arguments passed " + "after this one. 
Useful for fuzzers that need to do their own " + "argument parsing.") FUZZER_FLAG_STRING(run_equivalence_server, "Experimental") FUZZER_FLAG_STRING(use_equivalence_server, "Experimental") diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index a732f895375e..3fc3fe004cef 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -38,7 +38,6 @@ public: void Loop(); void MinimizeCrashLoop(const Unit &U); void ShuffleAndMinimize(UnitVector *V); - void InitializeTraceState(); void RereadOutputCorpus(size_t MaxSize); size_t secondsSinceProcessStartUp() { @@ -100,19 +99,10 @@ private: void WriteToOutputCorpus(const Unit &U); void WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix); void PrintStats(const char *Where, const char *End = "\n", size_t Units = 0); - void PrintStatusForNewUnit(const Unit &U); + void PrintStatusForNewUnit(const Unit &U, const char *Text); void ShuffleCorpus(UnitVector *V); void CheckExitOnSrcPosOrItem(); - // Trace-based fuzzing: we run a unit with some kind of tracing - // enabled and record potentially useful mutations. Then - // We apply these mutations one by one to the unit and run it again. - - // Start tracing; forget all previously proposed mutations. - void StartTraceRecording(); - // Stop tracing. - void StopTraceRecording(); - static void StaticDeathCallback(); void DumpCurrentUnit(const char *Prefix); void DeathCallback(); @@ -142,7 +132,7 @@ private: size_t MaxInputLen = 0; size_t MaxMutationLen = 0; - std::vector FeatureSetTmp; + std::vector UniqFeatureSetTmp; // Need to know our own thread. 
static thread_local bool IsMyThread; diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 6816f3af8a6f..8ac7a847aef7 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -114,7 +114,6 @@ Fuzzer::Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD, : CB(CB), Corpus(Corpus), MD(MD), Options(Options) { if (EF->__sanitizer_set_death_callback) EF->__sanitizer_set_death_callback(StaticDeathCallback); - InitializeTraceState(); assert(!F); F = this; TPC.ResetMaps(); @@ -403,22 +402,29 @@ bool Fuzzer::RunOne(const uint8_t *Data, size_t Size, bool MayDeleteFile, ExecuteCallback(Data, Size); - FeatureSetTmp.clear(); + UniqFeatureSetTmp.clear(); + size_t FoundUniqFeaturesOfII = 0; size_t NumUpdatesBefore = Corpus.NumFeatureUpdates(); TPC.CollectFeatures([&](size_t Feature) { - Corpus.AddFeature(Feature, Size, Options.Shrink); - if (Options.ReduceInputs) - FeatureSetTmp.push_back(Feature); + if (Corpus.AddFeature(Feature, Size, Options.Shrink)) + UniqFeatureSetTmp.push_back(Feature); + if (Options.ReduceInputs && II) + if (std::binary_search(II->UniqFeatureSet.begin(), + II->UniqFeatureSet.end(), Feature)) + FoundUniqFeaturesOfII++; }); PrintPulseAndReportSlowInput(Data, Size); size_t NumNewFeatures = Corpus.NumFeatureUpdates() - NumUpdatesBefore; if (NumNewFeatures) { Corpus.AddToCorpus({Data, Data + Size}, NumNewFeatures, MayDeleteFile, - FeatureSetTmp); + UniqFeatureSetTmp); CheckExitOnSrcPosOrItem(); return true; } - if (II && Corpus.TryToReplace(II, Data, Size, FeatureSetTmp)) { + if (II && FoundUniqFeaturesOfII && + FoundUniqFeaturesOfII == II->UniqFeatureSet.size() && + II->U.size() > Size) { + Corpus.Replace(II, {Data, Data + Size}); CheckExitOnSrcPosOrItem(); return true; } @@ -501,10 +507,10 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) { Printf("Base64: %s\n", Base64(U).c_str()); } -void Fuzzer::PrintStatusForNewUnit(const Unit &U) { +void Fuzzer::PrintStatusForNewUnit(const Unit 
&U, const char *Text) { if (!Options.PrintNEW) return; - PrintStats("NEW ", ""); + PrintStats(Text, ""); if (Options.Verbosity) { Printf(" L: %zd ", U.size()); MD.PrintMutationSequence(); @@ -515,7 +521,8 @@ void Fuzzer::PrintStatusForNewUnit(const Unit &U) { void Fuzzer::ReportNewCoverage(InputInfo *II, const Unit &U) { II->NumSuccessfullMutations++; MD.RecordSuccessfulMutationSequence(); - PrintStatusForNewUnit(U); + PrintStatusForNewUnit(U, II->Reduced ? "REDUCE" : + "NEW "); WriteToOutputCorpus(U); NumberOfNewUnitsAdded++; TPC.PrintNewPCs(); @@ -600,13 +607,10 @@ void Fuzzer::MutateAndTestOne() { assert(NewSize > 0 && "Mutator returned empty unit"); assert(NewSize <= CurrentMaxMutationLen && "Mutator return overisized unit"); Size = NewSize; - if (i == 0) - StartTraceRecording(); II.NumExecutedMutations++; if (RunOne(CurrentUnitData, Size, /*MayDeleteFile=*/true, &II)) ReportNewCoverage(&II, {CurrentUnitData, CurrentUnitData + Size}); - StopTraceRecording(); TryDetectingAMemoryLeak(CurrentUnitData, Size, /*DuringInitialCorpusExecution*/ false); } diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 612f4bbb28f2..616c0999aa39 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -241,7 +241,6 @@ void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) { return true; }); // Show stats. - TotalNumberOfRuns++; if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1))) PrintStats("pulse "); // Write the post-run marker and the coverage. @@ -286,12 +285,13 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, // Execute the inner process untill it passes. // Every inner process should execute at least one input. 
- std::string BaseCmd = CloneArgsWithoutX(Args, "keep-all-flags"); + auto BaseCmd = SplitBefore("-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "keep-all-flags")); bool Success = false; for (size_t i = 1; i <= AllFiles.size(); i++) { Printf("MERGE-OUTER: attempt %zd\n", i); - auto ExitCode = - ExecuteCommand(BaseCmd + " -merge_control_file=" + CFPath); + auto ExitCode = ExecuteCommand(BaseCmd.first + " -merge_control_file=" + + CFPath + " " + BaseCmd.second); if (!ExitCode) { Printf("MERGE-OUTER: succesfull in %zd attempt(s)\n", i); Success = true; diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp index 53cb9027e455..5998ef9d3193 100644 --- a/lib/Fuzzer/FuzzerMutate.cpp +++ b/lib/Fuzzer/FuzzerMutate.cpp @@ -43,8 +43,6 @@ MutationDispatcher::MutationDispatcher(Random &Rand, {&MutationDispatcher::Mutate_CrossOver, "CrossOver"}, {&MutationDispatcher::Mutate_AddWordFromManualDictionary, "ManualDict"}, - {&MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, - "TempAutoDict"}, {&MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary, "PersAutoDict"}, }); @@ -165,11 +163,6 @@ size_t MutationDispatcher::Mutate_AddWordFromManualDictionary(uint8_t *Data, return AddWordFromDictionary(ManualDictionary, Data, Size, MaxSize); } -size_t MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary( - uint8_t *Data, size_t Size, size_t MaxSize) { - return AddWordFromDictionary(TempAutoDictionary, Data, Size, MaxSize); -} - size_t MutationDispatcher::ApplyDictionaryEntry(uint8_t *Data, size_t Size, size_t MaxSize, DictionaryEntry &DE) { @@ -251,7 +244,7 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( uint8_t *Data, size_t Size, size_t MaxSize) { Word W; DictionaryEntry DE; - switch (Rand(3)) { + switch (Rand(4)) { case 0: { auto X = TPC.TORC8.Get(Rand.Rand()); DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); @@ -267,6 +260,10 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( auto X = TPC.TORCW.Get(Rand.Rand()); DE = 
MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); } break; + case 3: if (Options.UseMemmem) { + auto X = TPC.MMT.Get(Rand.Rand()); + DE = DictionaryEntry(X); + } break; default: assert(0); } @@ -533,14 +530,4 @@ void MutationDispatcher::AddWordToManualDictionary(const Word &W) { {W, std::numeric_limits::max()}); } -void MutationDispatcher::AddWordToAutoDictionary(DictionaryEntry DE) { - static const size_t kMaxAutoDictSize = 1 << 14; - if (TempAutoDictionary.size() >= kMaxAutoDictSize) return; - TempAutoDictionary.push_back(DE); -} - -void MutationDispatcher::ClearAutoDictionary() { - TempAutoDictionary.clear(); -} - } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerMutate.h b/lib/Fuzzer/FuzzerMutate.h index 8c8fb3fd74c7..84b04c0dbf3e 100644 --- a/lib/Fuzzer/FuzzerMutate.h +++ b/lib/Fuzzer/FuzzerMutate.h @@ -52,10 +52,6 @@ public: size_t Mutate_AddWordFromManualDictionary(uint8_t *Data, size_t Size, size_t MaxSize); - /// Mutates data by adding a word from the temporary automatic dictionary. - size_t Mutate_AddWordFromTemporaryAutoDictionary(uint8_t *Data, size_t Size, - size_t MaxSize); - /// Mutates data by adding a word from the TORC. 
size_t Mutate_AddWordFromTORC(uint8_t *Data, size_t Size, size_t MaxSize); @@ -84,8 +80,6 @@ public: void AddWordToManualDictionary(const Word &W); - void AddWordToAutoDictionary(DictionaryEntry DE); - void ClearAutoDictionary(); void PrintRecommendedDictionary(); void SetCorpus(const InputCorpus *Corpus) { this->Corpus = Corpus; } diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp index 6f5c7be41062..ced0a2133340 100644 --- a/lib/Fuzzer/FuzzerTracePC.cpp +++ b/lib/Fuzzer/FuzzerTracePC.cpp @@ -37,6 +37,8 @@ namespace fuzzer { TracePC TPC; +int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; + uint8_t *TracePC::Counters() const { return __sancov_trace_pc_guard_8bit_counters; } @@ -293,6 +295,20 @@ void TracePC::HandleCmp(uintptr_t PC, T Arg1, T Arg2) { ValueProfileMap.AddValue(Idx); } +static size_t InternalStrnlen(const char *S, size_t MaxLen) { + size_t Len = 0; + for (; Len < MaxLen && S[Len]; Len++) {} + return Len; +} + +// Finds min of (strlen(S1), strlen(S2)). +// Needed bacause one of these strings may actually be non-zero terminated. +static size_t InternalStrnlen2(const char *S1, const char *S2) { + size_t Len = 0; + for (; S1[Len] && S2[Len]; Len++) {} + return Len; +} + } // namespace fuzzer extern "C" { @@ -415,4 +431,71 @@ void __sanitizer_cov_trace_gep(uintptr_t Idx) { uintptr_t PC = reinterpret_cast(__builtin_return_address(0)); fuzzer::TPC.HandleCmp(PC, Idx, (uintptr_t)0); } + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, + const void *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + if (n <= 1) return; // Not interesting. 
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t Len1 = fuzzer::InternalStrnlen(s1, n); + size_t Len2 = fuzzer::InternalStrnlen(s2, n); + n = std::min(n, Len1); + n = std::min(n, Len2); + if (n <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t N = fuzzer::InternalStrnlen2(s1, s2); + if (N <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY 
+void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1, + const void *s2, size_t len2, void *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), len2); +} } // extern "C" diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h index 5ec8c590b4df..b36c4f54306c 100644 --- a/lib/Fuzzer/FuzzerTracePC.h +++ b/lib/Fuzzer/FuzzerTracePC.h @@ -45,6 +45,28 @@ struct TableOfRecentCompares { Pair Table[kSize]; }; +template +struct MemMemTable { + static const size_t kSize = kSizeT; + Word MemMemWords[kSize]; + Word EmptyWord; + + void Add(const uint8_t *Data, size_t Size) { + if (Size <= 2) return; + Size = std::min(Size, Word::GetMaxSize()); + size_t Idx = SimpleFastHash(Data, Size) % kSize; + MemMemWords[Idx].Set(Data, Size); + } + const Word &Get(size_t Idx) { + for (size_t i = 0; i < kSize; i++) { + const Word &W = MemMemWords[(Idx + i) % kSize]; + if (W.size()) return W; + } + EmptyWord.Set(nullptr, 0); + return EmptyWord; + } +}; + class TracePC { public: static const size_t kNumPCs = 1 << 21; @@ -81,6 +103,7 @@ class TracePC { TableOfRecentCompares TORC4; TableOfRecentCompares TORC8; TableOfRecentCompares TORCW; + MemMemTable<1024> MMT; void PrintNewPCs(); void InitializePrintNewPCs(); diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp deleted file mode 100644 index 8670e2ad6727..000000000000 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===- FuzzerTraceState.cpp - Trace-based fuzzer mutator ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open 
Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Data tracing. -//===----------------------------------------------------------------------===// - -#include "FuzzerDictionary.h" -#include "FuzzerIO.h" -#include "FuzzerInternal.h" -#include "FuzzerMutate.h" -#include "FuzzerTracePC.h" -#include -#include -#include -#include -#include - -namespace fuzzer { - -// Declared as static globals for faster checks inside the hooks. -static bool RecordingMemmem = false; - -int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; - -class TraceState { -public: - TraceState(MutationDispatcher &MD, const FuzzingOptions &Options, - const Fuzzer *F) - : MD(MD), Options(Options), F(F) {} - - void StartTraceRecording() { - if (!Options.UseMemmem) - return; - RecordingMemmem = true; - InterestingWords.clear(); - MD.ClearAutoDictionary(); - } - - void StopTraceRecording() { - if (!RecordingMemmem) - return; - for (auto &W : InterestingWords) - MD.AddWordToAutoDictionary({W}); - } - - void AddInterestingWord(const uint8_t *Data, size_t Size) { - if (!RecordingMemmem || !F->InFuzzingThread()) return; - if (Size <= 1) return; - Size = std::min(Size, Word::GetMaxSize()); - Word W(Data, Size); - InterestingWords.insert(W); - } - - private: - - // TODO: std::set is too inefficient, need to have a custom DS here. 
- std::set InterestingWords; - MutationDispatcher &MD; - const FuzzingOptions Options; - const Fuzzer *F; -}; - -static TraceState *TS; - -void Fuzzer::StartTraceRecording() { - if (!TS) return; - TS->StartTraceRecording(); -} - -void Fuzzer::StopTraceRecording() { - if (!TS) return; - TS->StopTraceRecording(); -} - -void Fuzzer::InitializeTraceState() { - if (!Options.UseMemmem) return; - TS = new TraceState(MD, Options, this); -} - -static size_t InternalStrnlen(const char *S, size_t MaxLen) { - size_t Len = 0; - for (; Len < MaxLen && S[Len]; Len++) {} - return Len; -} - -// Finds min of (strlen(S1), strlen(S2)). -// Needed bacause one of these strings may actually be non-zero terminated. -static size_t InternalStrnlen2(const char *S1, const char *S2) { - size_t Len = 0; - for (; S1[Len] && S2[Len]; Len++) {} - return Len; -} - -} // namespace fuzzer - -using fuzzer::TS; - -extern "C" { - -// We may need to avoid defining weak hooks to stay compatible with older clang. -#ifndef LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -# define LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS 1 -#endif - -#if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, - const void *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. 
- size_t Len1 = fuzzer::InternalStrnlen(s1, n); - size_t Len2 = fuzzer::InternalStrnlen(s2, n); - n = std::min(n, Len1); - n = std::min(n, Len2); - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); -} - - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - size_t N = fuzzer::InternalStrnlen2(s1, s2); - if (N <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memmem(void *called_pc, const void 
*s1, size_t len1, - const void *s2, size_t len2, void *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), len2); -} - -#endif // LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -} // extern "C" diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp index f5fd3a85187c..2d95f40e46a1 100644 --- a/lib/Fuzzer/FuzzerUtil.cpp +++ b/lib/Fuzzer/FuzzerUtil.cpp @@ -215,4 +215,11 @@ bool ExecuteCommandAndReadOutput(const std::string &Command, std::string *Out) { return true; } +size_t SimpleFastHash(const uint8_t *Data, size_t Size) { + size_t Res = 0; + for (size_t i = 0; i < Size; i++) + Res = Res * 11 + Data[i]; + return Res; +} + } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerUtil.h b/lib/Fuzzer/FuzzerUtil.h index f84fd9ef0fce..62d6e61dcf17 100644 --- a/lib/Fuzzer/FuzzerUtil.h +++ b/lib/Fuzzer/FuzzerUtil.h @@ -67,10 +67,20 @@ inline std::string CloneArgsWithoutX(const std::vector &Args, return CloneArgsWithoutX(Args, X, X); } +inline std::pair SplitBefore(std::string X, + std::string S) { + auto Pos = S.find(X); + if (Pos == std::string::npos) + return std::make_pair(S, ""); + return std::make_pair(S.substr(0, Pos), S.substr(Pos)); +} + std::string DisassembleCmd(const std::string &FileName); std::string SearchRegexCmd(const std::string &Regex); +size_t SimpleFastHash(const uint8_t *Data, size_t Size); + } // namespace fuzzer #endif // LLVM_FUZZER_UTIL_H diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp index d0521bdfdd67..15bceb896e17 100644 --- a/lib/Fuzzer/afl/afl_driver.cpp +++ b/lib/Fuzzer/afl/afl_driver.cpp @@ -22,8 +22,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { return 0; } EOF -# Build your target with -fsanitize-coverage=trace-pc using fresh clang. -clang -g -fsanitize-coverage=trace-pc test_fuzzer.cc -c +# Build your target with -fsanitize-coverage=trace-pc-guard using fresh clang. 
+clang -g -fsanitize-coverage=trace-pc-guard test_fuzzer.cc -c # Build afl-llvm-rt.o.c from the AFL distribution. clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c # Build this file, link it with afl-llvm-rt.o.o and the target code. diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index 30566bdc87ae..43aea2b7a186 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -90,6 +90,7 @@ set(Tests EmptyTest EquivalenceATest EquivalenceBTest + FlagsTest FourIndependentBranchesTest FullCoverageSetTest InitializeTest @@ -272,5 +273,5 @@ add_lit_testsuite(check-fuzzer "Running Fuzzer tests" # Don't add dependencies on Windows. The linker step would fail on Windows, # since cmake will use link.exe for linking and won't include compiler-rt libs. if(NOT MSVC) - add_dependencies(check-fuzzer FileCheck sancov not llvm-symbolizer) + add_dependencies(check-fuzzer FileCheck sancov not) endif() diff --git a/lib/Fuzzer/test/FlagsTest.cpp b/lib/Fuzzer/test/FlagsTest.cpp new file mode 100644 index 000000000000..ac64b9d48df5 --- /dev/null +++ b/lib/Fuzzer/test/FlagsTest.cpp @@ -0,0 +1,32 @@ +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. + +// Parse some flags +#include +#include + +static std::vector Flags; + +extern "C" int LLVMFuzzerInitialize(int *Argc, char ***Argv) { + // Parse --flags and anything after -ignore_remaining_args=1 is passed. 
+ int I = 1; + while (I < *Argc) { + std::string S((*Argv)[I++]); + if (S == "-ignore_remaining_args=1") + break; + if (S.substr(0, 2) == "--") + Flags.push_back(S); + } + while (I < *Argc) + Flags.push_back(std::string((*Argv)[I++])); + + return 0; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + fprintf(stderr, "BINGO "); + for (auto Flag : Flags) + fprintf(stderr, "%s ", Flag.c_str()); + fprintf(stderr, "\n"); + exit(0); +} diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp index 1053c28527bf..eba2663029b2 100644 --- a/lib/Fuzzer/test/FuzzerUnittest.cpp +++ b/lib/Fuzzer/test/FuzzerUnittest.cpp @@ -424,35 +424,6 @@ TEST(FuzzerMutate, AddWordFromDictionary2) { TestAddWordFromDictionary(&MutationDispatcher::Mutate, 1 << 15); } -void TestAddWordFromDictionaryWithHint(Mutator M, int NumIter) { - std::unique_ptr t(new ExternalFunctions()); - fuzzer::EF = t.get(); - Random Rand(0); - MutationDispatcher MD(Rand, {}); - uint8_t W[] = {0xAA, 0xBB, 0xCC, 0xDD, 0xFF, 0xEE, 0xEF}; - size_t PosHint = 7777; - MD.AddWordToAutoDictionary({Word(W, sizeof(W)), PosHint}); - int FoundMask = 0; - for (int i = 0; i < NumIter; i++) { - uint8_t T[10000]; - memset(T, 0, sizeof(T)); - size_t NewSize = (MD.*M)(T, 9000, 10000); - if (NewSize >= PosHint + sizeof(W) && - !memcmp(W, T + PosHint, sizeof(W))) - FoundMask = 1; - } - EXPECT_EQ(FoundMask, 1); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint1) { - TestAddWordFromDictionaryWithHint( - &MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, 1 << 5); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint2) { - TestAddWordFromDictionaryWithHint(&MutationDispatcher::Mutate, 1 << 10); -} - void TestChangeASCIIInteger(Mutator M, int NumIter) { std::unique_ptr t(new ExternalFunctions()); fuzzer::EF = t.get(); @@ -593,7 +564,7 @@ TEST(Corpus, Distribution) { size_t N = 10; size_t TriesPerUnit = 1<<16; for (size_t i = 0; i < N; i++) - C->AddToCorpus(Unit{ 
static_cast(i) }, 0, false, {}); + C->AddToCorpus(Unit{ static_cast(i) }, 1, false, {}); std::vector Hist(N); for (size_t i = 0; i < N * TriesPerUnit; i++) { diff --git a/lib/Fuzzer/test/fuzzer-flags.test b/lib/Fuzzer/test/fuzzer-flags.test index 76ea27705750..976da2906d7c 100644 --- a/lib/Fuzzer/test/fuzzer-flags.test +++ b/lib/Fuzzer/test/fuzzer-flags.test @@ -1,10 +1,21 @@ -RUN: LLVMFuzzer-SimpleTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR +# Does not work on windows for unknown reason. +UNSUPPORTED: windows + +RUN: LLVMFuzzer-FlagsTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR FOO_BAR: WARNING: unrecognized flag '-foo_bar=1'; use -help=1 to list all flags FOO_BAR: BINGO -RUN: LLVMFuzzer-SimpleTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH +RUN: LLVMFuzzer-FlagsTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH DASH_DASH: WARNING: did you mean '-max_len=100' (single dash)? DASH_DASH: INFO: A corpus is not provided, starting from an empty corpus -RUN: LLVMFuzzer-SimpleTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL +RUN: LLVMFuzzer-FlagsTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL NO_INTERNAL-NOT: internal flag + +RUN: LLVMFuzzer-FlagsTest --foo-bar -runs=10 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU +PASSTHRU: BINGO --foo-bar --baz -help=1 test + +RUN: mkdir -p %t/T0 %t/T1 +RUN: touch %t/T1/empty +RUN: LLVMFuzzer-FlagsTest --foo-bar -merge=1 %t/T0 %t/T1 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU-MERGE +PASSTHRU-MERGE: BINGO --foo-bar --baz -help=1 test diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test index f93a8b7199e2..77ca4b47bd01 100644 --- a/lib/Fuzzer/test/fuzzer-traces-hooks.test +++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test @@ -10,7 +10,7 @@ RUN: not LLVMFuzzer-StrstrTest -seed=1 -runs=2000000 2>&1 | File RUN: not 
LLVMFuzzer-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s -RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT +RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 -max_len=20 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT RECOMMENDED_DICT:###### Recommended dictionary. ###### RECOMMENDED_DICT-DAG: "foo" RECOMMENDED_DICT-DAG: "bar" diff --git a/lib/Fuzzer/test/reduce_inputs.test b/lib/Fuzzer/test/reduce_inputs.test index a4a5c57123d3..5ce4440788f4 100644 --- a/lib/Fuzzer/test/reduce_inputs.test +++ b/lib/Fuzzer/test/reduce_inputs.test @@ -9,5 +9,6 @@ CHECK: INFO: found item with checksum '0eb8e4ed029b774d80f2b66408203801cb982a60' RUN: LLVMFuzzer-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --check-prefix=COUNT COUNT: READ units: 3 - +# a bit longer test +RUN: LLVMFuzzer-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=1 -reduce_inputs=1 -runs=1000000 2>&1 | FileCheck %s diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 80371780fb6d..170bc544d53f 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -365,7 +365,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break; case CallingConv::PTX_Device: Out << "ptx_device"; break; case CallingConv::X86_64_SysV: Out << "x86_64_sysvcc"; break; - case CallingConv::X86_64_Win64: Out << "x86_64_win64cc"; break; + case CallingConv::Win64: Out << "win64cc"; break; case CallingConv::SPIR_FUNC: Out << "spir_func"; break; case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break; case CallingConv::Swift: Out << "swiftcc"; break; @@ -1964,6 +1964,7 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N, Printer.printString("name", N->getName()); Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false); Printer.printMetadata("entity", N->getRawEntity()); + 
Printer.printMetadata("file", N->getRawFile()); Printer.printInt("line", N->getLine()); Out << ")"; } diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index e31779c83e3a..f56fe7089807 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -44,8 +44,8 @@ bool Constant::isNegativeZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isNegZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -70,8 +70,8 @@ bool Constant::isZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -113,9 +113,13 @@ bool Constant::isAllOnesValue() const { return Splat->isAllOnesValue(); // Check for constant vectors which are splats of -1 values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isAllOnesValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isAllOnesValue(); + return CV->getElementAsAPInt(0).isAllOnesValue(); + } + } return false; } @@ -135,9 +139,13 @@ bool Constant::isOneValue() const { return Splat->isOneValue(); // Check for constant vectors which are splats of 1 values. 
- if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isOneValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isOneValue(); + return CV->getElementAsAPInt(0).isOneValue(); + } + } return false; } @@ -157,9 +165,13 @@ bool Constant::isMinSignedValue() const { return Splat->isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return CV->getElementAsAPInt(0).isMinSignedValue(); + } + } return false; } @@ -179,9 +191,13 @@ bool Constant::isNotMinSignedValue() const { return Splat->isNotMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isNotMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return !CV->getElementAsAPInt(0).isMinSignedValue(); + } + } // It *may* contain INT_MIN, we can't tell. 
return false; @@ -2565,6 +2581,34 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { } } +APInt ConstantDataSequential::getElementAsAPInt(unsigned Elt) const { + assert(isa(getElementType()) && + "Accessor can only be used when element is an integer"); + const char *EltPtr = getElementPointer(Elt); + + // The data is stored in host byte order, make sure to cast back to the right + // type to load with the right endianness. + switch (getElementType()->getIntegerBitWidth()) { + default: llvm_unreachable("Invalid bitwidth for CDS"); + case 8: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(8, EltVal); + } + case 16: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(16, EltVal); + } + case 32: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(32, EltVal); + } + case 64: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(64, EltVal); + } + } +} + APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { const char *EltPtr = getElementPointer(Elt); @@ -2623,17 +2667,21 @@ bool ConstantDataSequential::isCString() const { return Str.drop_back().find(0) == StringRef::npos; } -Constant *ConstantDataVector::getSplatValue() const { +bool ConstantDataVector::isSplat() const { const char *Base = getRawDataValues().data(); // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) - return nullptr; + return false; + + return true; +} +Constant *ConstantDataVector::getSplatValue() const { // If they're all the same, return the 0th one as a representative. - return getElementAsConstant(0); + return isSplat() ? 
getElementAsConstant(0) : nullptr; } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 2165ae5a9470..aba770457e2f 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -14,7 +14,6 @@ #include "llvm-c/Core.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 7e598b43ac16..bce28ba3b950 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -148,10 +148,13 @@ DICompileUnit *DIBuilder::createCompileUnit( static DIImportedEntity * createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, - Metadata *NS, unsigned Line, StringRef Name, + Metadata *NS, DIFile *File, unsigned Line, StringRef Name, SmallVectorImpl &AllImportedModules) { + if (Line) + assert(File && "Source location has line number but no file"); unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size(); - auto *M = DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), Line, Name); + auto *M = + DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), File, Line, Name); if (EntitiesCount < C.pImpl->DIImportedEntitys.size()) // A new Imported Entity was just added to the context. // Add it to the Imported Modules list. 
@@ -160,33 +163,38 @@ createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, - DINamespace *NS, + DINamespace *NS, DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIImportedEntity *NS, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIModule *M, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, M, Line, StringRef(), AllImportedModules); + Context, M, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context, DINode *Decl, + DIFile *File, unsigned Line, StringRef Name) { // Make sure to use the unique identifier based metadata reference for // types that have one. 
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration, - Context, Decl, Line, Name, AllImportedModules); + Context, Decl, File, Line, Name, + AllImportedModules); } DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory, diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index 0bf68b4c53bb..c14940bad45d 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -760,12 +760,13 @@ DIObjCProperty *DIObjCProperty::getImpl( DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIImportedEntity, (Tag, Scope, Entity, Line, Name)); - Metadata *Ops[] = {Scope, Entity, Name}; + DEFINE_GETIMPL_LOOKUP(DIImportedEntity, + (Tag, Scope, Entity, File, Line, Name)); + Metadata *Ops[] = {Scope, Entity, Name, File}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index 9bd0e297f4ef..4d7e3040ecd7 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -61,24 +61,30 @@ bool BasicBlockEdge::isSingleEdge() const { //===----------------------------------------------------------------------===// template class llvm::DomTreeNodeBase; -template class llvm::DominatorTreeBase; - -template void llvm::DomTreeBuilder::Calculate( - DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT, - Function &F); -template void llvm::DomTreeBuilder::Calculate>( - DominatorTreeBase>::NodeRef>::type> &DT, - Function &F); -template bool llvm::DomTreeBuilder::Verify( - const DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT); -template bool llvm::DomTreeBuilder::Verify>( - const DominatorTreeBase>::NodeRef>::type> &DT); 
+template class llvm::DominatorTreeBase; // DomTreeBase +template class llvm::DominatorTreeBase; // PostDomTreeBase + +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBDomTree &DT, Function &F); +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBPostDomTree &DT, Function &F); + +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBDomTree &DT); +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBPostDomTree &DT); bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index e413a4f34432..bea2c7ae8ff2 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -990,24 +990,26 @@ template <> struct MDNodeKeyImpl { unsigned Tag; Metadata *Scope; Metadata *Entity; + Metadata *File; unsigned Line; MDString *Name; - MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line, - MDString *Name) - : Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), Name(Name) {} + MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, Metadata *File, + unsigned Line, MDString *Name) + : Tag(Tag), Scope(Scope), Entity(Entity), File(File), Line(Line), + Name(Name) {} MDNodeKeyImpl(const DIImportedEntity *N) : Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()), - Line(N->getLine()), Name(N->getRawName()) {} + File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()) 
{} bool isKeyOf(const DIImportedEntity *RHS) const { return Tag == RHS->getTag() && Scope == RHS->getRawScope() && - Entity == RHS->getRawEntity() && Line == RHS->getLine() && - Name == RHS->getRawName(); + Entity == RHS->getRawEntity() && File == RHS->getFile() && + Line == RHS->getLine() && Name == RHS->getRawName(); } unsigned getHashValue() const { - return hash_combine(Tag, Scope, Entity, Line, Name); + return hash_combine(Tag, Scope, Entity, File, Line, Name); } }; diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 29e2f42d3e05..995e1e570340 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -625,21 +625,21 @@ void PMTopLevelManager::schedulePass(Pass *P) { checkAnalysis = false; const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet(); - for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(), - E = RequiredSet.end(); I != E; ++I) { + for (const AnalysisID ID : RequiredSet) { - Pass *AnalysisPass = findAnalysisPass(*I); + Pass *AnalysisPass = findAnalysisPass(ID); if (!AnalysisPass) { - const PassInfo *PI = findAnalysisPassInfo(*I); + const PassInfo *PI = findAnalysisPassInfo(ID); if (!PI) { // Pass P is not in the global PassRegistry dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n"; dbgs() << "Verify if there is a pass dependency cycle." 
<< "\n"; dbgs() << "Required Passes:" << "\n"; - for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(), - E = RequiredSet.end(); I2 != E && I2 != I; ++I2) { - Pass *AnalysisPass2 = findAnalysisPass(*I2); + for (const AnalysisID ID2 : RequiredSet) { + if (ID == ID2) + break; + Pass *AnalysisPass2 = findAnalysisPass(ID2); if (AnalysisPass2) { dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; } else { @@ -1070,17 +1070,15 @@ void PMDataManager::collectRequiredAndUsedAnalyses( void PMDataManager::initializeAnalysisImpl(Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); - for (AnalysisUsage::VectorType::const_iterator - I = AnUsage->getRequiredSet().begin(), - E = AnUsage->getRequiredSet().end(); I != E; ++I) { - Pass *Impl = findAnalysisPass(*I, true); + for (const AnalysisID ID : AnUsage->getRequiredSet()) { + Pass *Impl = findAnalysisPass(ID, true); if (!Impl) // This may be analysis pass that is initialized on the fly. // If that is not the case then it will raise an assert when it is used. 
continue; AnalysisResolver *AR = P->getResolver(); assert(AR && "Analysis Resolver is not set"); - AR->addAnalysisImplsPair(*I, Impl); + AR->addAnalysisImplsPair(ID, Impl); } } @@ -1112,21 +1110,19 @@ void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{ TPM->collectLastUses(LUses, P); - for (SmallVectorImpl::iterator I = LUses.begin(), - E = LUses.end(); I != E; ++I) { + for (Pass *P : LUses) { dbgs() << "--" << std::string(Offset*2, ' '); - (*I)->dumpPassStructure(0); + P->dumpPassStructure(0); } } void PMDataManager::dumpPassArguments() const { - for (SmallVectorImpl::const_iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) { - if (PMDataManager *PMD = (*I)->getAsPMDataManager()) + for (Pass *P : PassVector) { + if (PMDataManager *PMD = P->getAsPMDataManager()) PMD->dumpPassArguments(); else if (const PassInfo *PI = - TPM->findAnalysisPassInfo((*I)->getPassID())) + TPM->findAnalysisPassInfo(P->getPassID())) if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); } @@ -1255,9 +1251,8 @@ Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) { // Destructor PMDataManager::~PMDataManager() { - for (SmallVectorImpl::iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) - delete *I; + for (Pass *P : PassVector) + delete P; } //===----------------------------------------------------------------------===// @@ -1284,35 +1279,35 @@ bool BBPassManager::runOnFunction(Function &F) { bool Changed = doInitialization(F); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &BB : F) for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { BasicBlockPass *BP = getContainedPass(Index); bool LocalChanged = false; - dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName()); + dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, BB.getName()); dumpRequiredSet(BP); initializeAnalysisImpl(BP); { // If the pass crashes, remember this. 
- PassManagerPrettyStackEntry X(BP, *I); + PassManagerPrettyStackEntry X(BP, BB); TimeRegion PassTimer(getPassTimer(BP)); - LocalChanged |= BP->runOnBasicBlock(*I); + LocalChanged |= BP->runOnBasicBlock(BB); } Changed |= LocalChanged; if (LocalChanged) dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG, - I->getName()); + BB.getName()); dumpPreservedSet(BP); dumpUsedSet(BP); verifyPreservedAnalysis(BP); removeNotPreservedAnalysis(BP); recordAvailableAnalysis(BP); - removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG); + removeDeadPasses(BP, BB.getName(), ON_BASICBLOCK_MSG); } return doFinalization(F) || Changed; diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index fdc7de6eaa34..c230a50044c7 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -103,7 +103,7 @@ std::unique_ptr Module::createRNG(const Pass* P) const { // store salt metadata from the Module constructor. Salt += sys::path::filename(getModuleIdentifier()); - return std::unique_ptr{new RandomNumberGenerator(Salt)}; + return std::unique_ptr(new RandomNumberGenerator(Salt)); } /// getNamedValue - Return the first global value in the module with diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp index 4034f9039dda..b052c76d1fed 100644 --- a/lib/Object/ArchiveWriter.cpp +++ b/lib/Object/ArchiveWriter.cpp @@ -318,7 +318,8 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind, continue; if (!(Symflags & object::SymbolRef::SF_Global)) continue; - if (Symflags & object::SymbolRef::SF_Undefined) + if (Symflags & object::SymbolRef::SF_Undefined && + !(Symflags & object::SymbolRef::SF_Indirect)) continue; unsigned NameOffset = NameOS.tell(); diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp index 740bf94d40e0..d1f46fdfa292 100644 --- a/lib/Object/COFFImportFile.cpp +++ b/lib/Object/COFFImportFile.cpp @@ -131,14 +131,14 @@ class ObjectFactory { using u32 = support::ulittle32_t; MachineTypes Machine; BumpPtrAllocator Alloc; - StringRef DLLName; + 
StringRef ImportName; StringRef Library; std::string ImportDescriptorSymbolName; std::string NullThunkSymbolName; public: ObjectFactory(StringRef S, MachineTypes M) - : Machine(M), DLLName(S), Library(S.drop_back(4)), + : Machine(M), ImportName(S), Library(S.drop_back(4)), ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} @@ -162,14 +162,17 @@ public: // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType); + + // Create a weak external file which is described in PE/COFF Aux Format 3. + NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp); }; } // namespace NewArchiveMember ObjectFactory::createImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 7; - static const uint32_t NumberOfRelocations = 3; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 7; + const uint32_t NumberOfRelocations = 3; // COFF Header coff_file_header Header{ @@ -181,7 +184,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation) + // .idata$4 - (DLLName.size() + 1)), + (ImportName.size() + 1)), u32(NumberOfSymbols), u16(0), u16(is32bit(Machine) ? 
IMAGE_FILE_32BIT_MACHINE : 0), @@ -189,7 +192,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}, u32(0), u32(0), @@ -205,7 +208,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}, u32(0), u32(0), - u32(DLLName.size() + 1), + u32(ImportName.size() + 1), u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) + sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation)), @@ -219,12 +222,12 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$2 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); - static const coff_relocation RelocationTable[NumberOfRelocations] = { + const coff_relocation RelocationTable[NumberOfRelocations] = { {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2), u16(getImgRelRelocation(Machine))}, {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)), @@ -236,9 +239,9 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { // .idata$6 auto S = Buffer.size(); - Buffer.resize(S + DLLName.size() + 1); - memcpy(&Buffer[S], DLLName.data(), DLLName.size()); - Buffer[S + DLLName.size()] = '\0'; + Buffer.resize(S + ImportName.size() + 1); + memcpy(&Buffer[S], ImportName.data(), ImportName.size()); + Buffer[S + ImportName.size()] = '\0'; // Symbol Table coff_symbol16 SymbolTable[NumberOfSymbols] = { @@ -302,13 +305,13 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, 
DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 1; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 1; // COFF Header coff_file_header Header{ @@ -325,7 +328,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'}, u32(0), u32(0), @@ -342,7 +345,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$3 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); @@ -363,12 +366,12 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { writeStringTable(Buffer, {NullImportDescriptorSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 1; uint32_t VASize = is32bit(Machine) ? 
4 : 8; // COFF Header @@ -388,7 +391,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}, u32(0), u32(0), @@ -445,14 +448,14 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { writeStringTable(Buffer, {NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef{F, DLLName}}; + return {MemoryBufferRef{F, ImportName}}; } NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, ImportType ImportType, ImportNameType NameType) { - size_t ImpSize = DLLName.size() + Sym.size() + 2; // +2 for NULs + size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); memset(Buf, 0, Size); @@ -471,17 +474,96 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, // Write symbol name and DLL name. 
memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; - memcpy(P, DLLName.data(), DLLName.size()); + memcpy(P, ImportName.data(), ImportName.size()); - return {MemoryBufferRef(StringRef(Buf, Size), DLLName)}; + return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } -std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, +NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym, + StringRef Weak, bool Imp) { + std::vector Buffer; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 5; + + // COFF Header + coff_file_header Header{ + u16(0), + u16(NumberOfSections), + u32(0), + u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))), + u32(NumberOfSymbols), + u16(0), + u16(0), + }; + append(Buffer, Header); + + // Section Header Table + const coff_section SectionTable[NumberOfSections] = { + {{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'}, + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u16(0), + u16(0), + u32(IMAGE_SCN_LNK_INFO | IMAGE_SCN_LNK_REMOVE)}}; + append(Buffer, SectionTable); + + // Symbol Table + coff_symbol16 SymbolTable[NumberOfSymbols] = { + {{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_EXTERNAL, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_WEAK_EXTERNAL, + 1}, + {{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0}, + }; + SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t); + + //__imp_ String Table + if (Imp) { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 7; + writeStringTable(Buffer, {std::string("__imp_").append(Sym), + std::string("__imp_").append(Weak)}); + } else { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 1; + 
writeStringTable(Buffer, {Sym, Weak}); + } + append(Buffer, SymbolTable); + + // Copied here so we can still use writeStringTable + char *Buf = Alloc.Allocate(Buffer.size()); + memcpy(Buf, Buffer.data(), Buffer.size()); + return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)}; +} + +std::error_code writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, MachineTypes Machine) { std::vector Members; - ObjectFactory OF(llvm::sys::path::filename(DLLName), Machine); + ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine); std::vector ImportDescriptor; Members.push_back(OF.createImportDescriptor(ImportDescriptor)); @@ -496,6 +578,12 @@ std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, if (E.Private) continue; + if (E.isWeak()) { + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false)); + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true)); + continue; + } + ImportType ImportType = IMPORT_CODE; if (E.Data) ImportType = IMPORT_DATA; diff --git a/lib/Object/COFFModuleDefinition.cpp b/lib/Object/COFFModuleDefinition.cpp index 0d69cb6b709c..ed9140d1fe08 100644 --- a/lib/Object/COFFModuleDefinition.cpp +++ b/lib/Object/COFFModuleDefinition.cpp @@ -22,6 +22,7 @@ #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/Error.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm::COFF; @@ -55,8 +56,10 @@ struct Token { StringRef Value; }; -static bool isDecorated(StringRef Sym) { - return Sym.startswith("_") || Sym.startswith("@") || Sym.startswith("?"); +static bool isDecorated(StringRef Sym, bool MingwDef) { + // mingw does not prepend "_". + return (!MingwDef && Sym.startswith("_")) || Sym.startswith("@") || + Sym.startswith("?"); } static Error createError(const Twine &Err) { @@ -83,6 +86,9 @@ public: } case '=': Buf = Buf.drop_front(); + // GNU dlltool accepts both = and ==. 
+ if (Buf.startswith("=")) + Buf = Buf.drop_front(); return Token(Equal, "="); case ',': Buf = Buf.drop_front(); @@ -120,7 +126,8 @@ private: class Parser { public: - explicit Parser(StringRef S, MachineTypes M) : Lex(S), Machine(M) {} + explicit Parser(StringRef S, MachineTypes M, bool B) + : Lex(S), Machine(M), MingwDef(B) {} Expected parse() { do { @@ -181,14 +188,17 @@ private: std::string Name; if (Error Err = parseName(&Name, &Info.ImageBase)) return Err; - // Append the appropriate file extension if not already present. - StringRef Ext = IsDll ? ".dll" : ".exe"; - if (!StringRef(Name).endswith_lower(Ext)) - Name += Ext; + + Info.ImportName = Name; // Set the output file, but don't override /out if it was already passed. - if (Info.OutputFile.empty()) + if (Info.OutputFile.empty()) { Info.OutputFile = Name; + // Append the appropriate file extension if not already present. + if (!sys::path::has_extension(Name)) + Info.OutputFile += IsDll ? ".dll" : ".exe"; + } + return Error::success(); } case KwVersion: @@ -213,9 +223,9 @@ private: } if (Machine == IMAGE_FILE_MACHINE_I386) { - if (!isDecorated(E.Name)) + if (!isDecorated(E.Name, MingwDef)) E.Name = (std::string("_").append(E.Name)); - if (!E.ExtName.empty() && !isDecorated(E.ExtName)) + if (!E.ExtName.empty() && !isDecorated(E.ExtName, MingwDef)) E.ExtName = (std::string("_").append(E.ExtName)); } @@ -308,11 +318,13 @@ private: std::vector Stack; MachineTypes Machine; COFFModuleDefinition Info; + bool MingwDef; }; Expected parseCOFFModuleDefinition(MemoryBufferRef MB, - MachineTypes Machine) { - return Parser(MB.getBuffer(), Machine).parse(); + MachineTypes Machine, + bool MingwDef) { + return Parser(MB.getBuffer(), Machine, MingwDef).parse(); } } // namespace object diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 1e9b0c5b0454..0a2053477caf 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -227,8 +227,11 @@ uint32_t 
COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const { if (Symb.isExternal() || Symb.isWeakExternal()) Result |= SymbolRef::SF_Global; - if (Symb.isWeakExternal()) + if (Symb.isWeakExternal()) { Result |= SymbolRef::SF_Weak; + // We use indirect to allow the archiver to write weak externs + Result |= SymbolRef::SF_Indirect; + } if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE) Result |= SymbolRef::SF_Absolute; diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 0b2ea61c5fe0..81046b217862 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -141,6 +141,33 @@ template struct MemberRecordImpl : public MemberRecordBase { } // end namespace CodeViewYAML } // end namespace llvm +void ScalarTraits::output(const GUID &G, void *, llvm::raw_ostream &OS) { + OS << G; +} + +StringRef ScalarTraits::input(StringRef Scalar, void *Ctx, GUID &S) { + if (Scalar.size() != 38) + return "GUID strings are 38 characters long"; + if (Scalar[0] != '{' || Scalar[37] != '}') + return "GUID is not enclosed in {}"; + if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' || + Scalar[24] != '-') + return "GUID sections are not properly delineated with dashes"; + + uint8_t *OutBuffer = S.Guid; + for (auto Iter = Scalar.begin(); Iter != Scalar.end();) { + if (*Iter == '-' || *Iter == '{' || *Iter == '}') { + ++Iter; + continue; + } + uint8_t Value = (llvm::hexDigitValue(*Iter++) << 4); + Value |= llvm::hexDigitValue(*Iter++); + *OutBuffer++ = Value; + } + + return ""; +} + void ScalarTraits::output(const TypeIndex &S, void *, raw_ostream &OS) { OS << S.getIndex(); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index bcd365236e46..f3b438e829d6 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -390,27 +390,29 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) { return Name; } +namespace { +struct OptionInfo { + std::string Name; + 
StringRef HelpText; +}; +} // namespace + static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, - std::vector> &OptionHelp) { + std::vector &OptionHelp) { OS << Title << ":\n"; // Find the maximum option length. unsigned OptionFieldWidth = 0; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - // Skip titles. - if (!OptionHelp[i].second) - continue; - // Limit the amount of padding we are willing to give up for alignment. - unsigned Length = OptionHelp[i].first.size(); + unsigned Length = OptionHelp[i].Name.size(); if (Length <= 23) OptionFieldWidth = std::max(OptionFieldWidth, Length); } const unsigned InitialPad = 2; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - const std::string &Option = OptionHelp[i].first; + const std::string &Option = OptionHelp[i].Name; int Pad = OptionFieldWidth - int(Option.size()); OS.indent(InitialPad) << Option; @@ -419,7 +421,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, OS << "\n"; Pad = OptionFieldWidth + InitialPad; } - OS.indent(Pad + 1) << OptionHelp[i].second << '\n'; + OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n'; } } @@ -458,8 +460,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, // Render help text into a map of group-name to a list of (option, help) // pairs. 
- using helpmap_ty = - std::map>>; + using helpmap_ty = std::map>; helpmap_ty GroupedOptionHelp; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { @@ -478,7 +479,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, if (const char *Text = getOptionHelpText(Id)) { const char *HelpGroup = getOptionHelpGroup(*this, Id); const std::string &OptName = getOptionHelpName(*this, Id); - GroupedOptionHelp[HelpGroup].push_back(std::make_pair(OptName, Text)); + GroupedOptionHelp[HelpGroup].push_back({OptName, Text}); } } diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index fe69151665c6..2fd4f3ea0d45 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -45,22 +45,36 @@ static void *ErrorHandlerUserData = nullptr; static fatal_error_handler_t BadAllocErrorHandler = nullptr; static void *BadAllocErrorHandlerUserData = nullptr; +#if LLVM_ENABLE_THREADS == 1 // Mutexes to synchronize installing error handlers and calling error handlers. // Do not use ManagedStatic, or that may allocate memory while attempting to // report an OOM. +// +// This usage of std::mutex has to be conditionalized behind ifdefs because +// of this script: +// compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +// That script attempts to statically link the LLVM symbolizer library with the +// STL and hide all of its symbols with 'opt -internalize'. To reduce size, it +// cuts out the threading portions of the hermetic copy of libc++ that it +// builds. We can remove these ifdefs if that script goes away. 
static std::mutex ErrorHandlerMutex; static std::mutex BadAllocErrorHandlerMutex; +#endif void llvm::install_fatal_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif assert(!ErrorHandler && "Error handler already registered!\n"); ErrorHandler = handler; ErrorHandlerUserData = user_data; } void llvm::remove_fatal_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif ErrorHandler = nullptr; ErrorHandlerUserData = nullptr; } @@ -83,7 +97,9 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif handler = ErrorHandler; handlerData = ErrorHandlerUserData; } @@ -112,14 +128,18 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif assert(!ErrorHandler && "Bad alloc error handler already registered!\n"); BadAllocErrorHandler = handler; BadAllocErrorHandlerUserData = user_data; } void llvm::remove_bad_alloc_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif BadAllocErrorHandler = nullptr; BadAllocErrorHandlerUserData = nullptr; } @@ -130,7 +150,9 @@ void llvm::report_bad_alloc_error(const char *Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. 
+#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(BadAllocErrorHandlerMutex); +#endif Handler = BadAllocErrorHandler; HandlerData = BadAllocErrorHandlerUserData; } diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 9f22f89b3c9e..5cf0316d4d71 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -250,6 +250,8 @@ StringRef sys::detail::getHostCPUNameForS390x( Pos += sizeof("machine = ") - 1; unsigned int Id; if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) { + if (Id >= 3906 && HaveVectorSupport) + return "z14"; if (Id >= 2964 && HaveVectorSupport) return "z13"; if (Id >= 2827) @@ -460,8 +462,8 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_X64) #if defined(__GNUC__) || defined(__clang__) +#if defined(__x86_64__) // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. // FIXME: should we save this for Clang? 
__asm__("movq\t%%rbx, %%rsi\n\t" @@ -470,43 +472,24 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) : "a"(value), "c"(subleaf)); return false; -#elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -#elif defined(__i386__) || defined(_M_IX86) -#if defined(__GNUC__) || defined(__clang__) +#elif defined(__i386__) __asm__("movl\t%%ebx, %%esi\n\t" "cpuid\n\t" "xchgl\t%%ebx, %%esi\n\t" : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) : "a"(value), "c"(subleaf)); return false; -#elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; #else return true; #endif +#elif defined(_MSC_VER) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; #else return true; #endif diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index e58f856ca244..ea59ba62d7bd 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -13,8 +13,6 @@ #include "llvm/Support/Path.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 13bb6f23bc83..e8ef1d2fd8b9 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -452,6 +452,8 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions, Features.push_back("+ras"); if (Extensions & AArch64::AEK_LSE) 
Features.push_back("+lse"); + if (Extensions & AArch64::AEK_SVE) + Features.push_back("+sve"); return true; } diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 601084f9eae3..65eda246a7fe 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -60,6 +60,14 @@ Input::Input(StringRef InputContent, void *Ctxt, DocIterator = Strm->begin(); } +Input::Input(MemoryBufferRef Input, void *Ctxt, + SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt) + : IO(Ctxt), Strm(new Stream(Input, SrcMgr, false, &EC)) { + if (DiagHandler) + SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt); + DocIterator = Strm->begin(); +} + Input::~Input() = default; std::error_code Input::error() { return EC; } diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 9480cd46d28f..dd58eccee957 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -326,13 +326,30 @@ raw_ostream &raw_ostream::operator<<(const formatv_object_base &Obj) { } raw_ostream &raw_ostream::operator<<(const FormattedString &FS) { - unsigned Len = FS.Str.size(); - int PadAmount = FS.Width - Len; - if (FS.RightJustify && (PadAmount > 0)) - this->indent(PadAmount); - this->operator<<(FS.Str); - if (!FS.RightJustify && (PadAmount > 0)) + if (FS.Str.size() >= FS.Width || FS.Justify == FormattedString::JustifyNone) { + this->operator<<(FS.Str); + return *this; + } + const size_t Difference = FS.Width - FS.Str.size(); + switch (FS.Justify) { + case FormattedString::JustifyLeft: + this->operator<<(FS.Str); + this->indent(Difference); + break; + case FormattedString::JustifyRight: + this->indent(Difference); + this->operator<<(FS.Str); + break; + case FormattedString::JustifyCenter: { + int PadAmount = Difference / 2; this->indent(PadAmount); + this->operator<<(FS.Str); + this->indent(Difference - PadAmount); + break; + } + default: + llvm_unreachable("Bad Justification"); + } return *this; } diff --git a/lib/Target/AArch64/AArch64.h 
b/lib/Target/AArch64/AArch64.h index 37b9690d0434..1dda746a6be1 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -44,6 +44,8 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -66,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 53eef79c4df3..436bf1193304 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -50,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". 
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -269,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 938779d23690..291bc5ea858e 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f64], CCBitConvertToType>, + CCDelegateTo +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index ee54550c9900..b72f23b109d9 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -102,6 +102,10 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( case AArch64::LDADDALh: case AArch64::LDADDALs: case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: case AArch64::LDEORALb: case AArch64::LDEORALh: case AArch64::LDEORALs: diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 000000000000..c0e22355a9ff --- /dev/null +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// +// +// The 
LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and re-write loads to prevent unintnentional tag collisions. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of regsiters"); + +namespace { + +class FalkorMarkStridedAccesses { +public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { 
+public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing). + // AU.addPreserved(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis(); + const AArch64Subtarget *ST = + TPC.getTM().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis().getLoopInfo(); + ScalarEvolution &SE = getAnalysis().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction 
&I : *BB) { + LoadInst *LoadI = dyn_cast(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: 
+ case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case 
AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: 
+ case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + 
case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = 
false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. 
+ TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. 
+ if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. + BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. 
+ TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 3682b62d2b84..97396057dce0 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5138,6 +5138,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. (void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index e96ee7d29b3e..4907d082eda0 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) @@ -950,7 +954,13 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); + + unsigned GPRSaveSize = AFI->getVarArgsGPRSize(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + if (IsWin64) + Offset -= alignTo(GPRSaveSize, 16); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 04687847c1a3..06005f6b6886 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -239,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. 
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 60fde5caa339..c6150f9e5d1d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2650,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2668,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; @@ -2824,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. 
+ // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2869,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector MemOps; @@ -2881,7 +2889,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2890,7 +2901,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? 
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2899,7 +2914,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -4491,6 +4506,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4562,8 +4592,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? 
LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4571,7 +4607,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); @@ -7451,6 +7488,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { @@ -10567,9 +10612,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Currently leaving And and Sub to LLSC - if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) - return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? 
AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } @@ -10783,7 +10825,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { - if (Subtarget->isTargetDarwin()) + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb288..3b0e0f1de894 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,19 @@ public: bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. + + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } + bool isCheapToSpeculateCttz() const override { return true; } @@ -455,6 +468,8 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -541,6 +556,7 @@ private: SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td 
b/lib/Target/AArch64/AArch64InstrAtomics.td index de283b70210f..eec41ddbc159 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -451,3 +451,13 @@ def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)> def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index dba3e4bdf82f..c0c6055c358f 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -52,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - MachineMemOperand::MOTargetFlag1; - static cl::opt TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -1715,6 +1712,13 @@ void 
AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all MachineMemOperands for a hint that the load/store is strided. +bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { ArrayRef> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair TargetFlags[] = - {{MOSuppressPair, "aarch64-suppress-pair"}}; + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 0809ede4df2a..1765a0263ea4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ public: /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. + bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. 
bool isUnscaledLdSt(unsigned Opc) const; @@ -356,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. /// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0be14673eb20..0dcf07f98412 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -37,6 +37,8 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4a0a7c36baf8..ffb27834c31c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -82,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({Op, 1, s1}, Legal); } - for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index fab92e139dd0..9f7dcb3fe1c3 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction 
&MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -167,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a3238cf3b60f..ea6112452736 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -134,7 +134,9 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: PrefFunctionAlignment = 4; break; - case CortexA73: break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } @@ -171,7 +173,8 @@ struct AArch64GISelActualAccessor : public GISelAccessor { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this), GISel() { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index db53946cbc77..5a1f45ee2552 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ protected: bool HasFullFP16 = false; bool HasSPE = false; bool HasLSLFast = false; + bool HasSVE = false; // HasZeroCycleRegMove - Has zero-cycle 
register mov instructions. bool HasZeroCycleRegMove = false; @@ -251,6 +252,7 @@ public: bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -304,6 +306,17 @@ public: bool enableEarlyIfConversion() const override; std::unique_ptr getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6237b8f3e7b9..ba28c01a2eff 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -138,6 +138,9 @@ static cl::opt EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(-1)); +static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine X(getTheAArch64leTarget()); @@ -158,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -182,7 +187,7 @@ static std::string computeDataLayout(const Triple &TT, if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) - return "e-m:w-i64:64-i128:128-n32:64-S128"; + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -346,8 +351,12 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); @@ -478,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. 
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorHWPFFixPass()); + } } void AArch64PassConfig::addPreEmitPass() { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index fefa7e26b79f..85de02e859e0 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,8 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some, targets is + // deprecated and should not be used. const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e841fb894519..a79d51820545 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -86,7 +86,7 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands); bool parseDirectiveArch(SMLoc L); bool parseDirectiveCPU(SMLoc L); @@ -3257,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { +std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, + OperandVector &Operands) { switch (ErrCode) { case Match_MissingFeature: return Error(Loc, @@ -3380,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return 
Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + std::string Suggestion = AArch64MnemonicSpellCheck( + ((AArch64Operand &)*Operands[0]).getToken(), + ComputeAvailableFeatures(STI->getFeatureBits())); + return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); + } default: llvm_unreachable("unexpected error code!"); } @@ -3707,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, Msg); } case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); + return showMatchError(IDLoc, MatchResult, Operands); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -3726,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } case Match_InvalidMemoryIndexed1: case Match_InvalidMemoryIndexed2: @@ -3784,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } } diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 02b12b5e90ca..f7e0a5c7bed3 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp + AArch64FalkorHWPFFix.cpp AArch64FastISel.cpp AArch64A53Fix835769.cpp AArch64FrameLowering.cpp diff --git 
a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a7a7daf4b4a5..2bd0cbf9f7c6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -218,6 +220,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 7862a03e771c..31762b9e4cd5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -27,8 +27,7 @@ namespace { class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: AArch64WinCOFFObjectWriter() - : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { - } + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} ~AArch64WinCOFFObjectWriter() override = default; @@ -36,19 +35,59 @@ public: const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; - bool recordRelocation(const MCFixup &) const override; + bool recordRelocation(const MCFixup &) const override; }; } // end anonymous namespace -unsigned -AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, - const MCValue &Target, - const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend 
&MAB) const { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + + switch (static_cast(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + + case FK_Data_4: + switch (Modifier) { + default: + return COFF::IMAGE_REL_ARM64_ADDR32; + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM64_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM64_SECREL; + } + + case FK_Data_8: + return COFF::IMAGE_REL_ARM64_ADDR64; + + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM64_SECTION; + + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM64_SECREL; + + case AArch64::fixup_aarch64_add_imm12: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A; + + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; + + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; + + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + return COFF::IMAGE_REL_ARM64_BRANCH26; + } } bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 5a799b2d88d0..568682899be5 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -56,7 +56,7 @@ extern char 
&AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 7235d8fae332..c68e5861ff25 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -15,8 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -26,26 +28,27 @@ using namespace llvm; namespace { -class AMDGPUAnnotateKernelFeatures : public ModulePass { +class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: + const TargetMachine *TM = nullptr; AMDGPUAS AS; - static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS); - void addAttrToCallers(Function *Intrin, StringRef AttrName); - bool addAttrsForIntrinsics(Module &M, ArrayRef); + bool addFeatureAttributes(Function &F); public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {} - bool runOnModule(Module &M) override; + AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} + + bool doInitialization(CallGraph &CG) override; + bool runOnSCC(CallGraphSCC &SCC) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - ModulePass::getAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); } static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS); @@ -121,16 +124,130 @@ bool 
AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( return false; } -// Return true if an addrspacecast is used that requires the queue ptr. -bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, - AMDGPUAS AS) { +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. +static StringRef intrinsicToAttrName(Intrinsic::ID ID, + bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool handleAttr(Function &Parent, const Function &Callee, + StringRef Name) { + if (Callee.hasFnAttribute(Name)) { + Parent.addFnAttr(Name); + return true; + } + + return false; +} + +static void copyFeaturesToFunction(Function &Parent, const Function &Callee, + bool &NeedQueuePtr) { + // X ids unnecessarily propagated to kernels. 
+ static const StringRef AttrNames[] = { + { "amdgpu-work-item-id-x" }, + { "amdgpu-work-item-id-y" }, + { "amdgpu-work-item-id-z" }, + { "amdgpu-work-group-id-x" }, + { "amdgpu-work-group-id-y" }, + { "amdgpu-work-group-id-z" }, + { "amdgpu-dispatch-ptr" }, + { "amdgpu-dispatch-id" }, + { "amdgpu-kernarg-segment-ptr" } + }; + + if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) + NeedQueuePtr = true; + + for (StringRef AttrName : AttrNames) + handleAttr(Parent, Callee, AttrName); +} + +bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { + const AMDGPUSubtarget &ST = TM->getSubtarget(F); + bool HasFlat = ST.hasFlatAddressSpace(); + bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet ConstantExprVisited; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + bool Changed = false; + bool NeedQueuePtr = false; + bool HaveCall = false; + bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (CS) { + Function *Callee = CS.getCalledFunction(); + + // TODO: Do something with indirect calls. 
+ if (!Callee) { + if (!CS.isInlineAsm()) + HaveCall = true; + continue; + } + + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID == Intrinsic::not_intrinsic) { + HaveCall = true; + copyFeaturesToFunction(F, *Callee, NeedQueuePtr); + Changed = true; + } else { + bool NonKernelOnly = false; + StringRef AttrName = intrinsicToAttrName(IID, + NonKernelOnly, NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } + } + } + + if (NeedQueuePtr || HasApertureRegs) + continue; + if (const AddrSpaceCastInst *ASC = dyn_cast(&I)) { - if (castRequiresQueuePtr(ASC, AS)) - return true; + if (castRequiresQueuePtr(ASC, AS)) { + NeedQueuePtr = true; + continue; + } } for (const Use &U : I.operands()) { @@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) - return true; + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + NeedQueuePtr = true; + break; + } } } } - return false; -} - -void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, - StringRef AttrName) { - SmallPtrSet SeenFuncs; - - for (User *U : Intrin->users()) { - // CallInst is the only valid user for an intrinsic. - CallInst *CI = cast(U); - - Function *CallingFunction = CI->getParent()->getParent(); - if (SeenFuncs.insert(CallingFunction).second) - CallingFunction->addFnAttr(AttrName); + if (NeedQueuePtr) { + F.addFnAttr("amdgpu-queue-ptr"); + Changed = true; } -} - -bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( - Module &M, - ArrayRef IntrinsicToAttr) { - bool Changed = false; - for (const StringRef *Arr : IntrinsicToAttr) { - if (Function *Fn = M.getFunction(Arr[0])) { - addAttrToCallers(Fn, Arr[1]); - Changed = true; - } + // TODO: We could refine this to captured pointers that could possibly be + // accessed by flat instructions. 
For now this is mostly a poor way of + // estimating whether there are calls before argument lowering. + if (HasFlat && !IsFunc && HaveCall) { + F.addFnAttr("amdgpu-flat-scratch"); + Changed = true; } return Changed; } -bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { +bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { + Module &M = SCC.getCallGraph().getModule(); Triple TT(M.getTargetTriple()); - AS = AMDGPU::getAMDGPUAS(M); - - static const StringRef IntrinsicToAttr[][2] = { - // .x omitted - { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, - { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, - - { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, - { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, - - { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, - { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, - - // .x omitted - { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, - { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; - static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }, - { "llvm.trap", "amdgpu-queue-ptr" }, - { "llvm.debugtrap", "amdgpu-queue-ptr" } - }; - - // TODO: We should not add the attributes if the known compile time workgroup - // size is 1 for y/z. - - // TODO: Intrinsics that require queue ptr. + bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; - // We do not need to note the x workitem or workgroup id because they are - // always initialized. 
+ Changed |= addFeatureAttributes(*F); + } - bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { - Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); - for (Function &F : M) { - if (F.hasFnAttribute("amdgpu-queue-ptr")) - continue; + return Changed; +} - auto *TPC = getAnalysisIfAvailable(); - bool HasApertureRegs = TPC && TPC->getTM() - .getSubtarget(F) - .hasApertureRegs(); - if (!HasApertureRegs && hasAddrSpaceCast(F, AS)) - F.addFnAttr("amdgpu-queue-ptr"); - } - } +bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + report_fatal_error("TargetMachine is required"); - return Changed; + AS = AMDGPU::getAMDGPUAS(CG.getModule()); + TM = &TPC->getTM(); + return false; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { +Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { return new AMDGPUAnnotateKernelFeatures(); } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 83ad1a5c6ee3..2247814cfe55 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -268,19 +268,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF)); - OutStreamer->emitRawComment(" codeLenInByte = " + - Twine(getFunctionCodeSize(MF)), false); - OutStreamer->emitRawComment( - " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false); - OutStreamer->emitRawComment( - " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false); - OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); OutStreamer->emitRawComment( " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); - OutStreamer->emitRawComment( - " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false); OutStreamer->emitRawComment( " LDSByteSize: " + 
Twine(CurrentProgramInfo.LDSSize) + " bytes/workgroup (compile time only)", false); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2553cf4da0fe..258b1737deb3 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FABS); + setTargetDAGCombine(ISD::AssertZext); + setTargetDAGCombine(ISD::AssertSext); } //===----------------------------------------------------------------------===// @@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting /// input values across multiple registers. Each item in the Ins array -/// represents a single value that will be stored in regsters. Ins[x].VT is +/// represents a single value that will be stored in registers. Ins[x].VT is /// the value type of the value that will be stored in the register, so /// whatever SDNode we lower the argument to needs to be this type. /// @@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +// FIXME: This should go in generic DAG combiner with an isTruncateFree check, +// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU +// issues. 
+SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + // (vt2 (assertzext (truncate vt0:x), vt1)) -> + // (vt2 (truncate (assertzext vt0:x, vt1))) + if (N0.getOpcode() == ISD::TRUNCATE) { + SDValue N1 = N->getOperand(1); + EVT ExtVT = cast(N1)->getVT(); + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.bitsGE(ExtVT)) { + SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); + return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); + } + } + + return SDValue(); +} /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::AssertZext: + case ISD::AssertSext: + return performAssertSZExtCombine(N, DCI); } return SDValue(); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a45234e2b39f..d85aada6053a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -76,6 +76,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1bc5a52053ec..779617629010 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -277,7 +277,7 @@ std::pair AMDGPUSubtarget::getWavesPerEU( // Make 
sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 22cede59086a..d4b6a5fe8020 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ public: return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e3c90f250600..b37c274102bc 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f26e49295e69..966c6fec20c6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) @@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return 
decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3d71db909e20..4c755be09999 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -70,6 +70,7 @@ public: MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 3af242d9ea66..0aad8f0843d6 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 08a64de38501..7334781916d8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // No replacement necessary. if (ScratchWaveOffsetReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); } @@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // this point it appears we need the setup. 
This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::NoRegister) { + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + DebugLoc DL; - int64_t StackSize = MF.getFrameInfo().getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); if (StackSize == 0) { BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba570b9ebbb..2356405f0919 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info, - bool NeedSP) { + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. 
MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); - assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), - Info.getStackPtrOffsetReg())); - } } SDValue SITargetLowering::LowerFormalArguments( @@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); @@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - - // TODO: Could maybe omit SP if only tail calls? 
- bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); - - if (NeedSP) { - unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); - CCInfo.AllocateReg(StackPtrReg); - Info->setStackPtrOffsetReg(StackPtrReg); - } } return Chains.empty() ? Chain : @@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. 
case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); case ISD::ConstantFP: { auto F = cast(Op)->getValueAPF(); @@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine( if (!CFP) { SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType().getScalarType(); + auto ST = getSubtarget(); + + if (((VT == MVT::f32 && ST->hasFP32Denormals()) || + (VT == MVT::f64 && ST->hasFP64Denormals()) || + (VT == MVT::f16 && ST->hasFP16Denormals())) && + DAG.isKnownNeverNaN(N0)) + return N0; bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(N0, getSubtarget())) + isCanonicalized(DAG, N0, ST)) return N0; return SDValue(); @@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { } return TargetLowering::getConstraintType(Constraint); } + +// Figure out which registers should be reserved for stack access. Only after +// the function is legalized do we know all of the non-spill stack objects or if +// calls are present. +void SITargetLowering::finalizeLowering(MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + if (Info->isEntryFunction()) { + // Callable functions have fixed registers used for stack access. 
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + } + + // We have to assume the SP is needed in case there are calls in the function + // during lowering. Calls are only detected after the function is + // lowered. We're about to reserve registers, so don't bother using it if we + // aren't really going to use it. + bool NeedSP = !Info->isEntryFunction() || + MFI.hasVarSizedObjects() || + MFI.hasCalls(); + + if (NeedSP) { + unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); + Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); + } + + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + + TargetLoweringBase::finalizeLowering(MF); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 83392a7ab1b2..e6bb3d6cd419 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -232,6 +232,8 @@ public: ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; + + void finalizeLowering(MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 160f8837d49c..a7e0feb10b9f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector Worklist; - Worklist.push_back(&TopInst); + SetVectorType 
Worklist; + Worklist.insert(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); @@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } -void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, +void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp( } void SIInstrInfo::splitScalar64BitBCNT( - SmallVectorImpl &Worklist, MachineInstr &Inst) const { + SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, 
MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const { + SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); + Worklist.insert(&UseMI); do { ++I; @@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SmallVectorImpl &Worklist, +void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl &Worklist, } void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SmallVectorImpl &Worklist) const { + MachineInstr &SCCDefInst, SetVectorType &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. 
for (MachineInstr &MI : @@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) - Worklist.push_back(&MI); + Worklist.insert(&MI); } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index d00c0d4a7f4e..3dd5bc89e6c7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -19,6 +19,7 @@ #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" namespace llvm { @@ -38,6 +39,8 @@ private: EXECZ = 3 }; + typedef SmallSetVector SetVectorType; + static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); @@ -56,30 +59,30 @@ private: void swapOperands(MachineInstr &Inst) const; - void lowerScalarAbs(SmallVectorImpl &Worklist, + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SmallVectorImpl &Worklist, + void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; - void movePackToVALU(SmallVectorImpl &Worklist, + void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, - SmallVectorImpl &Worklist) const; + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SmallVectorImpl &Worklist) const; + SetVectorType &Worklist) const; const TargetRegisterClass * 
getDestEquivalentVGPRClass(const MachineInstr &Inst) const; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index ffb01363e131..088173680fa8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1436,7 +1436,7 @@ class VOPProfile _ArgVT> { field bit IsPacked = isPackedType.ret; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasOMod = !if(HasOpSel, 0, isFloatType.ret); field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bcc685015cf5..ba69e42d9125 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1060,7 +1060,7 @@ def : Pat < class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3203c38dae34..a7c8166ff6d2 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -23,10 +23,10 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), - FrameOffsetReg(AMDGPU::NoRegister), - StackPtrOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -42,6 +42,9 @@ 
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - return; + + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = true; + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (ST.debuggerEmitPrologue()) { // Enable everything. + WorkGroupIDX = true; WorkGroupIDY = true; WorkGroupIDZ = true; + WorkItemIDX = true; WorkItemIDY = true; WorkItemIDZ = true; } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; if (F->hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + if (F->hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; @@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. 
- if (WorkItemIDZ) - WorkItemIDY = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); + bool HasStackObjects = FrameInfo.hasStackObjects(); + + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } } - if (ST.isAmdCodeObjectV2(MF)) { + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. 
+ if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 05aa249584bf..4c7f38a09a48 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. unsigned PSInputAddr; unsigned PSInputEnable; @@ -377,10 +382,13 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); StackPtrOffsetReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. unsigned getStackPtrOffsetReg() const { return StackPtrOffsetReg; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ef6ad4ad0c8f..4a3fbb4593bb 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // We have to assume the SP is needed in case there are calls in the function, + // which is detected after the function is lowered. If we aren't really going + // to need SP, don't bother reserving it. 
unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index fc808011cd88..54ea7805e18d 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -23,6 +23,13 @@ class SIReg regIdx = 0> : Register, def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; +// Pseudo-registers: Used as placeholders during isel and immediately +// replaced, never seeing the verifier. +def PRIVATE_RSRC_REG : SIReg<"", 0>; +def FP_REG : SIReg<"", 0>; +def SP_REG : SIReg<"", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; + // VCC for 64-bit instructions def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias { @@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, + FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let AllocationPriority = 7; } @@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, + (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_128 : RegisterOperand; +def VSrc_128 : RegisterOperand { + let DecoderMethod = "DecodeVS_128RegisterClass"; +} 
//===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26515b27bb77..67ad904ca972 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { } bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - - if (Reg0 == Reg1) { - return true; + for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { + if (*R == Reg1) return true; } - - unsigned SubReg0 = TRI->getSubReg(Reg0, 1); - if (SubReg0 == 0) { - return TRI->getSubRegIndex(Reg1, Reg0) > 0; - } - - for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { - if (isRegIntersect(Reg1, SubReg0, TRI)) { - return true; - } - SubReg0 = TRI->getSubReg(Reg0, Idx); - } - return false; } diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 7b9bc71ad4c7..d5acb49b4f39 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo pattern=[]> : class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. 
-class SI2_VI3Alias : InstAlias < +class SI2_VI3Alias : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index a8ca593f14ed..92ed0706dc01 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,21 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -92,6 +96,7 @@ class VOP3_Profile : VOPProfile { class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
let HasModifiers = 0; + let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index f2de1f995726..3becf758aaa3 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -34,6 +34,9 @@ class VOP3_VOP3PInst, fma>; +def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; +def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; + def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>; def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum>; @@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>; def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>; @@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>; } +def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; + def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>; def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; @@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi op> { } } +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; @@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; defm 
V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index f3482a22d5dc..b636fc9be431 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -148,6 +148,19 @@ class VOPCInstAlias : let SubtargetPredicate = AssemblerPredicate; } +class getVOPCPat64 : LetDummies { + list ret = !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); +} + + multiclass VOPC_Pseudos , + def _e64 : VOP3_Pseudo.ret>, Commutable_REV { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -634,7 +640,7 @@ class FCMP_Pattern : Pat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE, DSTOMOD.NONE) + DSTCLAMP.NONE) >; def : FCMP_Pattern ; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 77b7952b22a8..b47538ba0349 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -136,6 +136,8 @@ class VOP3_Real : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + + VOPProfile Pfl = ps.Pfl; } // XXX - Is there any reason to distingusih this from regular VOP3 diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 
c40b4450a5b5..e49c1babac21 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -17,144 +17,172 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// -// ARM Helper classes. +// ARM Subtarget state. // -class ProcNoItin Features> - : Processor; +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", + "true", "Thumb mode">; + +def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", + "true", "Use software floating " + "point features.">; -class Architecture features > - : SubtargetFeature; //===----------------------------------------------------------------------===// -// ARM Subtarget state. +// ARM Subtarget features. // -def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", - "Thumb mode">; +// Floating Point, HW Division and Neon Support +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; -def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; -//===----------------------------------------------------------------------===// -// ARM Subtarget features. 
-// +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; + +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision " + "floating point">; + +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3, FeatureFP16]>; + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision " + "floating point", + [FeatureFPARMv8]>; + +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports " + "single precision only">; + +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict FP to 16 double registers">; + +def FeatureHWDivThumb : SubtargetFeature<"hwdiv", + "HasHardwareDivideInThumb", "true", + "Enable divide instructions in Thumb">; + +def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", + "HasHardwareDivideInARM", "true", + "Enable divide instructions in ARM mode">; + +// Atomic Support +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb/dsb) instructions">; + +def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", + "Has v7 clrex instruction">; -def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", - "Enable VFP2 instructions">; -def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", - "Enable VFP3 instructions", - [FeatureVFP2]>; -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable NEON instructions", - [FeatureVFP3]>; -def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", - "Enable Thumb2 instructions">; -def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", - "Does not support ARM mode execution", - [ModeThumb]>; -def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", - "Enable half-precision 
floating point">; -def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", - "Enable VFP4 instructions", - [FeatureVFP3, FeatureFP16]>; -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", - "true", "Enable ARMv8 FP", - [FeatureVFP4]>; -def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", - "Enable full half-precision floating point", - [FeatureFPARMv8]>; -def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict FP to 16 double registers">; -def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", - "true", - "Enable divide instructions in Thumb">; -def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", - "HasHardwareDivideInARM", "true", - "Enable divide instructions in ARM mode">; -def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", - "Has data barrier (dmb / dsb) instructions">; -def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", - "Has v7 clrex instruction">; def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", - "Has v8 acquire/release (lda/ldaex etc) instructions">; -def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", - "FP compare + branch is slow">; -def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", - "Floating point unit supports single precision only">; -def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", - "Enable support for Performance Monitor extensions">; -def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", - "Enable support for TrustZone security extensions">; -def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", - "Enable support for ARMv8-M Security Extensions">; -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable support for Cryptography extensions", - [FeatureNEON]>; -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable support for CRC 
instructions">; + "Has v8 acquire/release (lda/ldaex " + " etc) instructions">; + + +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance " + "Monitor extensions">; + + +// TrustZone Security Extensions +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone " + "security extensions">; + +def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", + "Enable support for ARMv8-M " + "Security Extensions">; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for " + "Cryptography extensions", + [FeatureNEON]>; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; + + // Not to be confused with FeatureHasRetAddrStack (return address stack) -def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", - "Enable Reliability, Availability and Serviceability extensions">; -def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", - "Enable fast computation of positive address offsets">; -def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", - "CPU fuses AES crypto operations">; - -// Cyclone has preferred instructions for zeroing VFP registers, which can -// execute in 0 cycles. -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -// Whether or not it may be profitable to unpredicate certain instructions -// during if conversion. 
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable Reliability, Availability " + "and Serviceability extensions">; + +// Fast computation of non-negative address offsets +def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", + "Enable fast computation of " + "positive address offsets">; + +// Fast execution of AES crypto operations +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +// Cyclone can zero VFP registers in 0 cycles. +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +// Whether it is profitable to unpredicate certain instructions during if-conversion def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", - "IsProfitableToUnpredicate", - "true", + "IsProfitableToUnpredicate", "true", "Is profitable to unpredicate">; // Some targets (e.g. Swift) have microcoded VGETLNi32. -def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", - "HasSlowVGETLNi32", "true", - "Has slow VGETLNi32 - prefer VMOV">; +def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", + "HasSlowVGETLNi32", "true", + "Has slow VGETLNi32 - prefer VMOV">; // Some targets (e.g. Swift) have microcoded VDUP32. -def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", - "Has slow VDUP32 - prefer VMOV">; +def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", + "true", + "Has slow VDUP32 - prefer VMOV">; // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON // for scalar FP, as this allows more effective execution domain optimization. 
-def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", - "true", "Prefer VMOVSR">; +def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", + "true", "Prefer VMOVSR">; // Swift has ISHST barriers compatible with Atomic Release semantics but weaker // than ISH def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", - "true", "Prefer ISHST barriers">; + "true", "Prefer ISHST barriers">; // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. -def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", - "Has muxed AGU and NEON/FPU">; +def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", + "true", + "Has muxed AGU and NEON/FPU">; -// On some targets, a VLDM/VSTM starting with an odd register number needs more -// microops than single VLDRS. +// Whether VLDM/VSTM starting with odd register number need more microops +// than single VLDRS def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", - "true", "VLDM/VSTM starting with an odd register is slow">; + "true", "VLDM/VSTM starting " + "with an odd register is slow">; // Some targets have a renaming dependency when loading into D subregisters. def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. -def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", - "Expand VFP/NEON MLA/MLS instructions">; +def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", + "ExpandMLx", "true", + "Expand VFP/NEON MLA/MLS instructions">; // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. 
def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", @@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from // VFP to NEON, as an execution domain optimization. -def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", - "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">; +def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", + "UseNEONForFPMovs", "true", + "Convert VMOVSR, VMOVRS, " + "VMOVS to NEON">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. This affects instruction selection and should // only be enabled if the handling of denormals is not important. -def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", - "true", - "Use NEON for single precision FP">; +def FeatureNEONForFP : SubtargetFeature<"neonfp", + "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; // On some processors, VLDn instructions that access unaligned data take one // extra cycle. Take that into account when computing operand latencies. @@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", // Some processors have a nonpipelined VFP coprocessor. def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", "NonpipelinedVFP", "true", - "VFP instructions are not pipelined">; + "VFP instructions are not pipelined">; // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. 
-def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", - "Disable VFP / NEON MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", - "HasVMLxForwarding", "true", - "Has multiplier accumulator forwarding">; + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", @@ -213,14 +244,16 @@ def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", "true", "Disable +1 predication cost for instructions updating CPSR">; -def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", - "AvoidMOVsShifterOperand", "true", - "Avoid movs instructions with shifter operand">; +def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", + "AvoidMOVsShifterOperand", "true", + "Avoid movs instructions with " + "shifter operand">; // Some processors perform return stack prediction. CodeGen should avoid issue // "normal" call instructions to callees which do not return. -def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", - "Has return address stack">; +def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", + "HasRetAddrStack", "true", + "Has return address stack">; // Some processors have no branch predictor, which changes the expected cost of // taking a branch which affects the choice of whether to use predicated @@ -230,63 +263,80 @@ def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", "Has no branch predictor">; /// DSP extension. 
-def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", - "Supports DSP instructions in ARM and/or Thumb2">; +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in " + "ARM and/or Thumb2">; // Multiprocessing extension. -def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", - "Supports Multiprocessing extension">; +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; // Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). def FeatureVirtualization : SubtargetFeature<"virtualization", - "HasVirtualization", "true", - "Supports Virtualization extension", - [FeatureHWDivThumb, FeatureHWDivARM]>; + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDivThumb, FeatureHWDivARM]>; -// M-series ISA -def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", - "Is microcontroller profile ('M' series)">; +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. 
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; -// R-series ISA -def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", - "Is realtime profile ('R' series)">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + +def FeatureExecuteOnly : SubtargetFeature<"execute-only", + "GenExecuteOnly", "true", + "Enable the generation of " + "execute only code.">; + +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable" + " as GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for " + "32-bit imms">; + +def FeatureNoNegativeImmediates + : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + + +//===----------------------------------------------------------------------===// +// ARM architecture class +// // A-series ISA def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", "Is application profile ('A' series)">; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. 
-def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - -def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", - "Disallow all unaligned memory " - "access">; +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; -def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", - "Generate calls via indirect call " - "instructions">; +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", + "Is microcontroller profile ('M' series)">; -def FeatureExecuteOnly - : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", - "Enable the generation of execute only code.">; -def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", - "Reserve R9, making it unavailable as " - "GPR">; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; -def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", - "Don't use movt/movw pairs for 32-bit " - "imms">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution", + [ModeThumb]>; -def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", - "NegativeImmediates", "false", - "Convert immediates and instructions " - "to their negated or complemented " - "equivalent when the immediate does " - "not fit in the encoding.">; //===----------------------------------------------------------------------===// // ARM ISAa. 
@@ -294,43 +344,57 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; + def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", "Support ARM v5T instructions", [HasV4TOps]>; + def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", - "Support ARM v5TE, v5TEj, and v5TExp instructions", + "Support ARM v5TE, v5TEj, and " + "v5TExp instructions", [HasV5TOps]>; + def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", "Support ARM v6 instructions", [HasV5TEOps]>; + def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; + def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true", "Support ARM v8M Baseline instructions", [HasV6MOps]>; + def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", "Support ARM v6k instructions", [HasV6Ops]>; + def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>; + def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", [HasV6T2Ops, FeaturePerfMon, FeatureV7Clrex]>; + +def HasV8MMainlineOps : + SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", + "Support ARM v8M Mainline instructions", + [HasV7Ops]>; + def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", [HasV7Ops, FeatureAcquireRelease]>; + def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; -def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps]>; -def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", - "Support ARM v8M Mainline instructions", - [HasV7Ops]>; 
//===----------------------------------------------------------------------===// @@ -386,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", "Cortex-M3 ARM processors", []>; + //===----------------------------------------------------------------------===// -// ARM schedules. +// ARM Helper classes. // -include "ARMSchedule.td" +class Architecture features> + : SubtargetFeature; + +class ProcNoItin Features> + : Processor; //===----------------------------------------------------------------------===// @@ -546,6 +616,12 @@ def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; +//===----------------------------------------------------------------------===// +// ARM schedules. +//===----------------------------------------------------------------------===// +// +include "ARMSchedule.td" + //===----------------------------------------------------------------------===// // ARM processors // @@ -553,6 +629,9 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; // Dummy CPU, used to target architectures def : ProcessorModel<"generic", CortexA8Model, []>; +// FIXME: Several processors below are not using their own scheduler +// model, but one of similar/previous processor. These should be fixed. 
+ def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; def : ProcNoItin<"strongarm", [ARMv4]>; @@ -612,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, FeatureVFP2, FeatureHasSlowFPVMLx]>; -// FIXME: A5 has currently the same Schedule model as A8 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureHasRetAddrStack, FeatureTrustZone, @@ -656,7 +734,6 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, FeatureCheckVLDnAlign, FeatureMP]>; -// FIXME: A12 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureHasRetAddrStack, FeatureTrustZone, @@ -666,7 +743,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureVirtualization, FeatureMP]>; -// FIXME: A15 has currently the same Schedule model as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureDontWidenVMOVS, FeatureHasRetAddrStack, @@ -678,7 +754,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: A17 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureHasRetAddrStack, FeatureTrustZone, @@ -688,9 +763,7 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: krait has currently the same Schedule model as A9 -// FIXME: krait has currently the same features as A9 plus VFP4 and hardware -// division features. +// FIXME: krait has currently the same features as A9 plus VFP4 and HWDiv def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureHasRetAddrStack, FeatureMuxedUnits, @@ -720,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureSlowVGETLNi32, FeatureSlowVDUP32]>; -// FIXME: R4 has currently the same ProcessorModel as A8. 
def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureAvoidPartialCPSR]>; -// FIXME: R4F has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, @@ -734,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureD16, FeatureAvoidPartialCPSR]>; -// FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, FeatureVFP3, @@ -744,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, FeatureVFP3, @@ -814,14 +883,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO, - FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, @@ -835,7 +904,6 @@ def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureCrypto, FeatureCRC]>; -// Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -881,9 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, //===----------------------------------------------------------------------===// include "ARMRegisterInfo.td" - include "ARMRegisterBanks.td" - include "ARMCallingConv.td" 
//===----------------------------------------------------------------------===// @@ -891,7 +957,6 @@ include "ARMCallingConv.td" //===----------------------------------------------------------------------===// include "ARMInstrInfo.td" - def ARMInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -912,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant { } def ARM : Target { - // Pull in Instruction Info: + // Pull in Instruction Info. let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; let AssemblyParserVariants = [ARMAsmParserVariant]; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e97a7ce5067f..370c0a7f5c53 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -117,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget(); if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && @@ -163,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // both or otherwise does not want to enable this optimization, the function // should return NULL if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return nullptr; return STI.isTargetDarwin() ? 
CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 384f80356cc8..bf00ef61c2d1 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -250,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return false; // Look to see if our OptionalDef is defining CPSR or CCR. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (MO.getReg() == ARM::CPSR) *CPSR = true; @@ -267,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { AFI->isThumb2Function()) return MI->isPredicable(); - for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.OpInfo[i].isPredicate()) + for (const MCOperandInfo &opInfo : MCID.operands()) + if (opInfo.isPredicate()) return true; return false; @@ -1972,7 +1971,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, break; } case CCValAssign::AExt: - // Intentional fall-through. Handle AExt and ZExt. + // Intentional fall-through. Handle AExt and ZExt. 
case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); @@ -2001,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, assert(VA.getLocVT() == MVT::f64 && "Custom lowering for v2f64 args not available"); + // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size() CCValAssign &NextVA = ArgLocs[++i]; assert(VA.isRegLoc() && NextVA.isRegLoc() && @@ -2172,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); AddOptionalDefs(MIB); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned R : RetRegs) + MIB.addReg(R, RegState::Implicit); return true; } @@ -2233,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { ArgRegs.reserve(I->getNumOperands()); ArgVTs.reserve(I->getNumOperands()); ArgFlags.reserve(I->getNumOperands()); - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { unsigned Arg = getRegForValue(Op); if (Arg == 0) return false; @@ -2278,8 +2277,8 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2423,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. 
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2932,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, bool Found = false; bool isZExt; - for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); - i != e; ++i) { - if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && - (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && - MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) { + if (FLE.Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FLE.ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) { Found = true; - isZExt = FoldableLoadExtends[i].isZExt; + isZExt = FLE.isZExt; } } if (!Found) return false; @@ -3057,9 +3055,8 @@ bool ARMFastISel::fastLowerArguments() { }; const TargetRegisterClass *RC = &ARM::rGPRRegClass; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - unsigned ArgNo = I->getArgNo(); + for (const Argument &Arg : F->args()) { + unsigned ArgNo = Arg.getArgNo(); unsigned SrcReg = GPRArgRegs[ArgNo]; unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. 
@@ -3069,7 +3066,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(&*I, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 7f9fe55a5c38..f75dd4de3f96 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2682,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDNode *ResNode; if (Subtarget->isThumb()) { - SDValue Pred = getAL(CurDAG, dl); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + SDValue Ops[] = { + CPIdx, + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops); } else { @@ -2698,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, Ops); } + // Annotate the Node with memory operand information so that MachineInstr + // queries work properly. This e.g. gives the register allocation the + // required information for rematerialization. 
+ MachineFunction& MF = CurDAG->getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, 4); + + cast(ResNode)->setMemRefs(MemOp, MemOp+1); + ReplaceNode(N, ResNode); return; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 29ef69ad0010..faed6b867e2b 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -722,6 +722,29 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { return false; break; } + case G_BRCOND: { + if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) { + DEBUG(dbgs() << "Unsupported condition register for G_BRCOND"); + return false; + } + + // Set the flags. + auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri)) + .addReg(I.getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI)) + return false; + + // Branch conditionally. 
+ auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc)) + .add(I.getOperand(1)) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index f23e62595d2e..1c17c07e4cb0 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -66,14 +66,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - // FIXME: Support s8 and s16 as well - for (unsigned Op : {G_SREM, G_UREM}) + for (unsigned Op : {G_SREM, G_UREM}) { + for (auto Ty : {s8, s16}) + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Lower); else if (AEABI(ST)) setAction({Op, s32}, Custom); else setAction({Op, s32}, Libcall); + } for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); @@ -88,6 +90,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_SELECT, p0}, Legal); setAction({G_SELECT, 1, s1}, Legal); + setAction({G_BRCOND, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); for (auto Ty : {s1, s8, s16}) setAction({G_CONSTANT, Ty}, WidenScalar); diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 13acea3c28a9..48b02d40b246 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -153,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, break; } - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; if (AP.lowerOperand(MO, MCOp)) { if (MCOp.isImm() && EncodeImms) { diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index c0c09e8c15af..844930235894 
100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -259,6 +259,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; case G_SELECT: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); (void)Ty2; assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); @@ -282,6 +283,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case G_FCMP: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty1 = MRI.getType(MI.getOperand(2).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(3).getReg()); (void)Ty2; @@ -329,6 +331,13 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &ARM::ValueMappings[ARM::DPR3OpsIdx]}); break; } + case G_BR: + OperandsMapping = getOperandsMapping({nullptr}); + break; + case G_BRCOND: + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); + break; default: return getInvalidInstructionMapping(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f41da3e8e223..22ce949367f3 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -47,6 +47,8 @@ public: ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. 
const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index cc7a7c3849bc..81b0aa7f8b98 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -515,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + bool isSelectOp = MI.getOpcode() == BPF::Select; - assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert"); + assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert"); // To "insert" a SELECT instruction, we actually have to insert the diamond // control-flow pattern. The incoming instruction knows the destination vreg @@ -548,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Insert Branch if Flag unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); int CC = MI.getOperand(3).getImm(); + int NewCC; switch (CC) { case ISD::SETGT: - BuildMI(BB, DL, TII.get(BPF::JSGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri; break; case ISD::SETUGT: - BuildMI(BB, DL, TII.get(BPF::JUGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri; break; case ISD::SETGE: - BuildMI(BB, DL, TII.get(BPF::JSGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri; break; case ISD::SETUGE: - BuildMI(BB, DL, TII.get(BPF::JUGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri; break; case ISD::SETEQ: - BuildMI(BB, DL, TII.get(BPF::JEQ_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? 
BPF::JEQ_rr : BPF::JEQ_ri; break; case ISD::SETNE: - BuildMI(BB, DL, TII.get(BPF::JNE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri; break; default: report_fatal_error("unimplemented select CondCode " + Twine(CC)); } + if (isSelectOp) + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addReg(MI.getOperand(2).getReg()) + .addMBB(Copy1MBB); + else + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addImm(MI.getOperand(2).getImm()) + .addMBB(Copy1MBB); // Copy0MBB: // %FalseValue = ... diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 5ad777268208..f68357809add 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -460,6 +460,11 @@ let usesCustomInserter = 1 in { "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2", [(set i64:$dst, (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>; + def Select_Ri : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2), + "# Select PSEUDO $dst = $lhs $imm $rhs ? 
$src : $src2", + [(set i64:$dst, + (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>; } // load 64-bit global addr into register diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index b064778c4bbd..d75d95a6baea 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexbit" - #include "HexagonBitTracker.h" #include "HexagonTargetMachine.h" #include "llvm/ADT/BitVector.h" @@ -42,6 +40,8 @@ #include #include +#define DEBUG_TYPE "hexbit" + using namespace llvm; static cl::opt PreserveTiedOps("hexbit-keep-tied", cl::Hidden, diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 2dc74632e9be..30ebf89c9808 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -45863,6 +45863,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000000; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dccleaninva : HInst< (outs), @@ -45872,6 +45873,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000010; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dcfetch : HInst< (outs), @@ -45900,6 +45902,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000001; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dczeroa : HInst< (outs), @@ -45909,6 +45912,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000110; let isSoloAin1 = 1; +let hasSideEffects = 1; let mayStore = 1; } def Y2_icinva : HInst< diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 03c4a83594b3..80361015e649 100644 --- 
a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -59,8 +59,6 @@ // J2_jump , %PC // Successors according to CFG: BB#6 BB#3 -#define DEBUG_TYPE "hexagon-eif" - #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" @@ -90,6 +88,8 @@ #include #include +#define DEBUG_TYPE "hexagon-eif" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 734f3c6658d9..a2f6dd68c1a1 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -86,8 +86,6 @@ // however, is that finding the locations where the implicit uses need // to be added, and updating the live ranges will be more involved. -#define DEBUG_TYPE "expand-condsets" - #include "HexagonInstrInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -116,6 +114,8 @@ #include #include +#define DEBUG_TYPE "expand-condsets" + using namespace llvm; static cl::opt OptTfrLimit("expand-condsets-tfr-limit", diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index c790579ccebc..e5e75198b2d1 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -8,8 +8,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-pei" - #include "HexagonFrameLowering.h" #include "HexagonBlockRanges.h" #include "HexagonInstrInfo.h" @@ -63,6 +61,8 @@ #include #include +#define DEBUG_TYPE "hexagon-pei" + // Hexagon stack frame layout as defined by the ABI: // // Incoming arguments diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index bf31e1699284..0a955aedaf1a 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -7,8 +7,6 @@ // 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexinsert" - #include "BitTracker.h" #include "HexagonBitTracker.h" #include "HexagonInstrInfo.h" @@ -44,6 +42,8 @@ #include #include +#define DEBUG_TYPE "hexinsert" + using namespace llvm; static cl::opt VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index 3470480d607d..2da211563e0a 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gen-pred" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +33,8 @@ #include #include +#define DEBUG_TYPE "gen-pred" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 67242764d453..3997702bc962 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1286,16 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } -// Creates a SPLAT instruction for a constant value VAL. -static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT, - SDValue Val) { - EVT T = VT.getVectorElementType(); - if (T == MVT::i8 || T == MVT::i16) - return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val); - - return SDValue(); -} - static bool isSExtFree(SDValue N) { // A sign-extend of a truncate of a sign-extend is free. if (N.getOpcode() == ISD::TRUNCATE && @@ -1374,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// Handle only specific vector loads. 
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = LoadNode->getChain(); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - SDValue Result; - SDValue Base = LoadNode->getBasePtr(); - ISD::LoadExtType Ext = LoadNode->getExtensionType(); - unsigned Alignment = LoadNode->getAlignment(); - SDValue LoadChain; - - if(Ext == ISD::NON_EXTLOAD) - Ext = ISD::ZEXTLOAD; - - if (VT == MVT::v4i16) { - if (Alignment == 2) { - SDValue Loads[4]; - // Base load. - Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base+2 load. - SDValue Increment = DAG.getConstant(2, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base and base+2. - SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32); - SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount); - SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]); - // Base + 4. - Increment = DAG.getConstant(4, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base + 6. - Increment = DAG.getConstant(6, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base+4 and base+6. 
- Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount); - SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]); - // Combine to i64. This could be optimised out later if we can - // affect reg allocation of this code. - Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2); - LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - Loads[0].getValue(1), Loads[1].getValue(1), - Loads[2].getValue(1), Loads[3].getValue(1)); - } else { - // Perform default type expansion. - Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), - LoadNode->getAlignment(), - LoadNode->getMemOperand()->getFlags()); - LoadChain = Result.getValue(1); - } - } else - llvm_unreachable("Custom lowering unsupported load"); - - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - // Since we pretend to lower a load, we need the original chain - // info attached to the result. - SDValue Ops[] = { Result, LoadChain }; - - return DAG.getMergeValues(Ops, DL); -} - SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1971,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Handling of vector operations. // - // Custom lower v4i16 load only. Let v4i16 store to be - // promoted for now. promoteLdStType(MVT::v4i8, MVT::i32); promoteLdStType(MVT::v2i16, MVT::i32); promoteLdStType(MVT::v8i8, MVT::i64); + promoteLdStType(MVT::v4i16, MVT::i64); promoteLdStType(MVT::v2i32, MVT::i64); - setOperationAction(ISD::LOAD, MVT::v4i16, Custom); - setOperationAction(ISD::STORE, MVT::v4i16, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64); - AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64); - // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. 
static const unsigned VectExpOps[] = { @@ -2301,7 +2212,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; - case HexagonISD::VPACK: return "HexagonISD::VPACK"; + case HexagonISD::VPACKE: return "HexagonISD::VPACKE"; + case HexagonISD::VPACKO: return "HexagonISD::VPACKO"; case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; @@ -2394,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization @@ -2409,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) } } if (IsScalarToVector) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); } - return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(Lane, dl, MVT::i32)); } if (UseHVX) { ArrayRef Mask = SVN->getMask(); size_t MaskLen = Mask.size(); - int ElemSizeInBits = VT.getScalarSizeInBits(); - if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) || - (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) { - // Return 1 for odd and 2 of even - StridedLoadKind Pattern = isStridedLoad(Mask); + unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen; + if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) || + 
(Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) { + StridedLoadKind Pattern = isStridedLoad(Mask); if (Pattern == StridedLoadKind::NoPattern) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32); - SDValue Ops[] = { Vec1, Vec0, StridePattern }; - return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops); + unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE + : HexagonISD::VPACKO; + return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)}); } // We used to assert in the "else" part here, but that is bad for Halide // Halide creates intermediate double registers by interleaving two @@ -2531,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Size > 64) return SDValue(); - APInt APSplatBits, APSplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; unsigned NElts = BVN->getNumOperands(); // Try to generate a SPLAT instruction. - if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) && - (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, - HasAnyUndefs, 0, true) && SplatBitSize <= 16)) { - unsigned SplatBits = APSplatBits.getZExtValue(); - int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >> - (32 - SplatBitSize)); - return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32)); + if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, false)) { + if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) { + unsigned ZV = APSplatBits.getZExtValue(); + assert(SplatBitSize <= 32 && "Can only handle up to i32"); + // Sign-extend the splat value from SplatBitSize to 32. + int32_t SV = SplatBitSize < 32 + ? 
int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize) + : int32_t(ZV); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(SV, dl, MVT::i32)); + } + } } // Try to generate COMBINE to build v2i32 vectors. @@ -2974,8 +2891,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - // Custom lower some vector loads. - case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index bfd2c94eeaba..d66cbc95e918 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -62,7 +62,8 @@ namespace HexagonISD { EXTRACTU, EXTRACTURP, VCOMBINE, - VPACK, + VPACKE, + VPACKO, TC_RETURN, EH_RETURN, DCFETCH, @@ -164,7 +165,6 @@ namespace HexagonISD { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index c611857ec26a..104a28654dd5 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -1366,6 +1366,18 @@ defm : MaskedStore ; defm : MaskedStore ; defm : MaskedStore ; +//******************************************************************* +// SYSTEM +//******************************************************************* + +def: T_R_pat; +def: T_R_pat; +def: T_R_pat; +def: 
T_R_pat; + +def: T_RR_pat; +def: T_RP_pat; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index a331c978f59d..374ffa3799b0 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -10,8 +10,6 @@ // load/store instructions. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "opt-addr-mode" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" @@ -36,6 +34,8 @@ #include #include +#define DEBUG_TYPE "opt-addr-mode" + static cl::opt CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ba98b8994937..804a547d5b33 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2250,6 +2250,12 @@ def: Storea_pat, I32, addrgp, PS_storerhabs>; def: Storea_pat, I32, addrgp, PS_storeriabs>; def: Storea_pat, I64, addrgp, PS_storerdabs>; +// Prefer this pattern to S2_asl_i_p_or for the special case of joining +// two 32-bit words into a 64-bit word. 
+let AddedComplexity = 200 in +def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)), + (A2_combinew I32:$a, I32:$b)>; + def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)), (i64 (zext (i32 (and I32:$a, (i32 65535)))))), (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))), @@ -2971,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, Requires<[UseHVXDbl]>; -def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>, - SDTCisInt<3>]>; - -def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>; - -// 0 as the last argument denotes vpacke. 1 denotes vpacko -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 0))), - (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 1))), - (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 0))), - (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 1))), - (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; - -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 1))), - (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 1))), - (V6_vpackoh_128B VecDblRegs:$Vs, 
VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; +def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>; + +def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>; +def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>; + +let Predicates = [UseHVXSgl] in { + def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>; +} + +let Predicates = [UseHVXDbl] in { + def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; +} def V2I1: PatLeaf<(v2i1 PredRegs:$R)>; def V4I1: PatLeaf<(v4i1 PredRegs:$R)>; @@ -3073,6 +3074,10 @@ def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // four halfwords of 64-bits destination register. 
def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)), + (A2_combineii imm:$s8, imm:$s8)>; +def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>; + class VArith_pat : Pat <(Op Type:$Rss, Type:$Rtt), @@ -3099,14 +3104,11 @@ def: VArith_pat ; def: VArith_pat ; def: VArith_pat ; -def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_lsr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asl_i_vw V2I32:$b, imm:$c)>; def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 34df2ebcc520..ea86c9c42f47 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -53,6 +53,10 @@ static cl::opt EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), cl::desc("Emit hexagon jump tables in function section")); +static cl::opt + EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon lookup tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -136,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal( << (Kind.isBSS() ? "kind_bss " : "" ) << (Kind.isBSSLocal() ? 
"kind_bss_local " : "" )); + // If the lookup table is used by more than one function, do not place + // it in text section. + if (EmitLutInText && GO->getName().startswith("switch.table")) { + if (const Function *Fn = getLutUsedFunction(GO)) + return selectSectionForLookupTable(GO, TM, Fn); + } + if (isGlobalInSmallSection(GO, TM)) return selectSmallSectionForGlobal(GO, Kind, TM); @@ -402,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +// Return the function that uses the lookup table. If there are more +// than one live function that uses this look table, bail out and place +// the lookup table in default section. +const Function * +HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const { + const Function *ReturnFn = nullptr; + for (auto U : GO->users()) { + // validate each instance of user to be a live function. + auto *I = dyn_cast(U); + if (!I) + continue; + auto *Bb = I->getParent(); + if (!Bb) + continue; + auto *UserFn = Bb->getParent(); + if (!ReturnFn) + ReturnFn = UserFn; + else if (ReturnFn != UserFn) + return nullptr; + } + return ReturnFn; +} + +MCSection *HexagonTargetObjectFile::selectSectionForLookupTable( + const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const { + + SectionKind Kind = SectionKind::getText(); + // If the function has explicit section, place the lookup table in this + // explicit section. 
+ if (Fn->hasSection()) + return getExplicitSectionGlobal(Fn, Kind, TM); + + const auto *FuncObj = dyn_cast(Fn); + return SelectSectionForGlobal(FuncObj, Kind, TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 373d850b53be..eff44f097e03 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -36,6 +36,8 @@ namespace llvm { bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, const Function &F) const override; + const Function *getLutUsedFunction(const GlobalObject *GO) const; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; @@ -46,6 +48,10 @@ namespace llvm { MCSection *selectSmallSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const; + + MCSection *selectSectionForLookupTable(const GlobalObject *GO, + const TargetMachine &TM, + const Function *Fn) const; }; } // namespace llvm diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b72c9d534478..e12188e70602 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,9 +304,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); - bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -2514,16 +2511,6 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? 
MER_Fail : MER_Success; - case Mips::MFTC0: case Mips::MTTC0: - case Mips::MFTGPR: case Mips::MTTGPR: - case Mips::MFTLO: case Mips::MTTLO: - case Mips::MFTHI: case Mips::MTTHI: - case Mips::MFTACX: case Mips::MTTACX: - case Mips::MFTDSP: case Mips::MTTDSP: - case Mips::MFTC1: case Mips::MTTC1: - case Mips::MFTHC1: case Mips::MTTHC1: - case Mips::CFTC1: case Mips::CTTC1: - return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4895,212 +4882,6 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -// Map the DSP accumulator and control register to the corresponding gpr -// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions -// do not map the DSP registers contigously to gpr registers. -static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { - switch (Inst.getOpcode()) { - case Mips::MFTLO: - case Mips::MTTLO: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::ZERO; - case Mips::AC1: - return Mips::A0; - case Mips::AC2: - return Mips::T0; - case Mips::AC3: - return Mips::T4; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTHI: - case Mips::MTTHI: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::AT; - case Mips::AC1: - return Mips::A1; - case Mips::AC2: - return Mips::T1; - case Mips::AC3: - return Mips::T5; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTACX: - case Mips::MTTACX: - switch (Inst.getOperand(IsMFDSP ? 
1 : 0).getReg()) { - case Mips::AC0: - return Mips::V0; - case Mips::AC1: - return Mips::A2; - case Mips::AC2: - return Mips::T2; - case Mips::AC3: - return Mips::T6; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTDSP: - case Mips::MTTDSP: - return Mips::S0; - default: - llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); - } -} - -// Map the floating point register operand to the corresponding register -// operand. -static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { - switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { - case Mips::F0: return Mips::ZERO; - case Mips::F1: return Mips::AT; - case Mips::F2: return Mips::V0; - case Mips::F3: return Mips::V1; - case Mips::F4: return Mips::A0; - case Mips::F5: return Mips::A1; - case Mips::F6: return Mips::A2; - case Mips::F7: return Mips::A3; - case Mips::F8: return Mips::T0; - case Mips::F9: return Mips::T1; - case Mips::F10: return Mips::T2; - case Mips::F11: return Mips::T3; - case Mips::F12: return Mips::T4; - case Mips::F13: return Mips::T5; - case Mips::F14: return Mips::T6; - case Mips::F15: return Mips::T7; - case Mips::F16: return Mips::S0; - case Mips::F17: return Mips::S1; - case Mips::F18: return Mips::S2; - case Mips::F19: return Mips::S3; - case Mips::F20: return Mips::S4; - case Mips::F21: return Mips::S5; - case Mips::F22: return Mips::S6; - case Mips::F23: return Mips::S7; - case Mips::F24: return Mips::T8; - case Mips::F25: return Mips::T9; - case Mips::F26: return Mips::K0; - case Mips::F27: return Mips::K1; - case Mips::F28: return Mips::GP; - case Mips::F29: return Mips::SP; - case Mips::F30: return Mips::FP; - case Mips::F31: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc1 alias!"); - } -} - -// Map the coprocessor operand the corresponding gpr register operand. -static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { - switch (Inst.getOperand(IsMFTC0 ? 
1 : 0).getReg()) { - case Mips::COP00: return Mips::ZERO; - case Mips::COP01: return Mips::AT; - case Mips::COP02: return Mips::V0; - case Mips::COP03: return Mips::V1; - case Mips::COP04: return Mips::A0; - case Mips::COP05: return Mips::A1; - case Mips::COP06: return Mips::A2; - case Mips::COP07: return Mips::A3; - case Mips::COP08: return Mips::T0; - case Mips::COP09: return Mips::T1; - case Mips::COP010: return Mips::T2; - case Mips::COP011: return Mips::T3; - case Mips::COP012: return Mips::T4; - case Mips::COP013: return Mips::T5; - case Mips::COP014: return Mips::T6; - case Mips::COP015: return Mips::T7; - case Mips::COP016: return Mips::S0; - case Mips::COP017: return Mips::S1; - case Mips::COP018: return Mips::S2; - case Mips::COP019: return Mips::S3; - case Mips::COP020: return Mips::S4; - case Mips::COP021: return Mips::S5; - case Mips::COP022: return Mips::S6; - case Mips::COP023: return Mips::S7; - case Mips::COP024: return Mips::T8; - case Mips::COP025: return Mips::T9; - case Mips::COP026: return Mips::K0; - case Mips::COP027: return Mips::K1; - case Mips::COP028: return Mips::GP; - case Mips::COP029: return Mips::SP; - case Mips::COP030: return Mips::FP; - case Mips::COP031: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc0 alias!"); - } -} - -/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing -/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. 
-bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI) { - MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned rd = 0; - unsigned u = 1; - unsigned sel = 0; - unsigned h = 0; - bool IsMFTR = false; - switch (Inst.getOpcode()) { - case Mips::MFTC0: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTC0: - u = 0; - rd = getRegisterForMxtrC0(Inst, IsMFTR); - sel = Inst.getOperand(2).getImm(); - break; - case Mips::MFTGPR: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTGPR: - rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); - break; - case Mips::MFTLO: - case Mips::MFTHI: - case Mips::MFTACX: - case Mips::MFTDSP: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTLO: - case Mips::MTTHI: - case Mips::MTTACX: - case Mips::MTTDSP: - rd = getRegisterForMxtrDSP(Inst, IsMFTR); - sel = 1; - break; - case Mips::MFTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MFTC1: - IsMFTR = true; - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::MTTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::CFTC1: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::CTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 3; - break; - } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd - : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() - : Inst.getOperand(0).getReg()); - - TOut.emitRRIII(IsMFTR ? 
Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, - STI); - return false; -} - unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 7caeb08589af..2907b7715857 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -193,21 +193,6 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } -void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, - unsigned Reg1, int16_t Imm0, int16_t Imm1, - int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(MCOperand::createReg(Reg0)); - TmpInst.addOperand(MCOperand::createReg(Reg1)); - TmpInst.addOperand(MCOperand::createImm(Imm0)); - TmpInst.addOperand(MCOperand::createImm(Imm1)); - TmpInst.addOperand(MCOperand::createImm(Imm2)); - TmpInst.setLoc(IDLoc); - getStreamer().EmitInstruction(TmpInst, *STI); -} - void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index d2f0fdcc6cc1..6ceb05577538 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -190,6 +190,9 @@ def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", + "Disable use of the jal instruction">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a6ec9fb2e598..20319f85696c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + } + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -923,14 +936,127 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } +static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, + const MipsSubtarget &Subtarget) { + // ROOTNode must have a multiplication as an operand for the match to be + // successful. + if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL && + ROOTNode->getOperand(1).getOpcode() != ISD::MUL) + return SDValue(); + + // We don't handle vector types here. 
+ if (ROOTNode->getValueType(0).isVector()) + return SDValue(); + + // For MIPS64, madd / msub instructions are inefficent to use with 64 bit + // arithmetic. E.g. + // (add (mul a b) c) => + // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in + // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32) + // or + // MIPS64R2: (dins (mflo res) (mfhi res) 32 32) + // + // The overhead of setting up the Hi/Lo registers and reassembling the + // result makes this a dubious optimzation for MIPS64. The core of the + // problem is that Hi/Lo contain the upper and lower 32 bits of the + // operand and result. + // + // It requires a chain of 4 add/mul for MIPS64R2 to get better code + // density than doing it naively, 5 for MIPS64. Additionally, using + // madd/msub on MIPS64 requires the operands actually be 32 bit sign + // extended operands, not true 64 bit values. + // + // FIXME: For the moment, disable this completely for MIPS64. + if (Subtarget.hasMips64()) + return SDValue(); + + SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(0) + : ROOTNode->getOperand(1); + + SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(1) + : ROOTNode->getOperand(0); + + // Transform this to a MADD only if the user of this node is the add. + // If there are other users of the mul, this function returns here. + if (!Mult.hasOneUse()) + return SDValue(); + + // maddu and madd are unusual instructions in that on MIPS64 bits 63..31 + // must be in canonical form, i.e. sign extended. For MIPS32, the operands + // of the multiply must have 32 or more sign bits, otherwise we cannot + // perform this optimization. We have to check this here as we're performing + // this optimization pre-legalization. 
+ SDValue MultLHS = Mult->getOperand(0); + SDValue MultRHS = Mult->getOperand(1); + + bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND && + MultRHS->getOpcode() == ISD::SIGN_EXTEND; + bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND && + MultRHS->getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Initialize accumulator. + SDLoc DL(ROOTNode); + SDValue TopHalf; + SDValue BottomHalf; + BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(0, DL)); + + TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(1, DL)); + SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + BottomHalf, + TopHalf); + + // Create MipsMAdd(u) / MipsMSub(u) node. + bool IsAdd = ROOTNode->getOpcode() == ISD::ADD; + unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd) + : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub); + SDValue MAddOps[3] = { + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; + EVT VTs[2] = {MVT::i32, MVT::i32}; + SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); + + SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + SDValue Combined = + CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); + return Combined; +} + +static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); + + return SDValue(); + } + + return SDValue(); +} + static SDValue performADDCombine(SDNode *N, 
SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + // (add v0 (mul v1, v2)) => (madd v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); - if (DCI.isBeforeLegalizeOps()) return SDValue(); + } + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1058,6 +1184,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); + case ISD::SUB: + return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -3021,6 +3149,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT Ty = Callee.getValueType(); bool GlobalOrExternal = false, IsCallReloc = false; + // The long-calls feature is ignored in case of PIC. + // While we do not support -mshared / -mno-shared properly, + // ignore long-calls in case of -mabicalls too. + if (Subtarget.useLongCalls() && !Subtarget.isABICalls() && !IsPIC) { + // Get the address of the callee into a register to prevent + // using of the `jal` instruction for the direct call. + if (auto *N = dyn_cast(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + else if (auto *N = dyn_cast(Callee)) + Callee = Subtarget.hasSym32() ? 
getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + } + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { if (IsPIC) { const GlobalValue *Val = G->getGlobal(); diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 94f3a74be98b..0333fe6520fa 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in { } def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; +def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; +def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} + let AdditionalPredicates = [NotInMicroMips] in { def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2, FGR_32; diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index edc0981e6278..64bee5bfba18 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -35,8 +35,6 @@ class FIELD5 Val> { def FIELD5_1_DMT_EMT : FIELD5<0b00001>; def FIELD5_2_DMT_EMT : FIELD5<0b01111>; def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; -def FIELD5_MFTR : FIELD5<0b01000>; -def FIELD5_MTTR : FIELD5<0b01100>; class COP0_MFMC0_MT : MipsMTInst { bits<32> Inst; @@ -52,25 +50,6 @@ class COP0_MFMC0_MT : MipsMTInst { let Inst{2-0} = 0b001; } -class COP0_MFTTR_MT : MipsMTInst { - bits<32> Inst; - - bits<5> rt; - bits<5> rd; - bits<1> u; - bits<1> h; - bits<3> sel; - let Inst{31-26} = 0b010000; // COP0 - let Inst{25-21} = Op.Value; // MFMC0 - let Inst{20-16} = rt; - let Inst{15-11} = rd; - let Inst{10-6} = 0b00000; // rx - currently unsupported. 
- let Inst{5} = u; - let Inst{4} = h; - let Inst{3} = 0b0; - let Inst{2-0} = sel; -} - class SPECIAL3_MT_FORK : MipsMTInst { bits<32> Inst; diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td index 72e626cbec40..ab6693f60fd9 100644 --- a/lib/Target/Mips/MipsMTInstrInfo.td +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -6,13 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the MIPS MT ASE as defined by MD00378 1.12. -// -// TODO: Add support for the microMIPS encodings for the MT ASE and add the -// instruction mappings. -// -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // MIPS MT Instruction Encodings @@ -34,10 +27,6 @@ class FORK_ENC : SPECIAL3_MT_FORK; class YIELD_ENC : SPECIAL3_MT_YIELD; -class MFTR_ENC : COP0_MFTTR_MT; - -class MTTR_ENC : COP0_MFTTR_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Descriptions //===----------------------------------------------------------------------===// @@ -50,22 +39,6 @@ class MT_1R_DESC_BASE { InstrItinClass Itinerary = Itin; } -class MFTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MFTR; -} - -class MTTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MTTR; -} - class FORK_DESC { dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); dag InOperandList = (ins GPR32Opnd:$rt); @@ -106,73 +79,8 @@ let hasSideEffects = 1, isNotDuplicable = 1, 
def FORK : FORK_ENC, FORK_DESC, ASE_MT; def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; - - def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; - - def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; } -//===----------------------------------------------------------------------===// -// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. -//===----------------------------------------------------------------------===// -def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, - uimm3:$sel), - "mftc0 $rd, $rt, $sel">, ASE_MT; - -def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mftgpr $rd, $rt">, ASE_MT; - -def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftlo $rt, $ac">, ASE_MT; - -def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mfthi $rt, $ac">, ASE_MT; - -def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftacx $rt, $ac">, ASE_MT; - -def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), - "mftdsp $rt">, ASE_MT; - -def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mftc1 $rt, $ft">, ASE_MT; - -def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mfthc1 $rt, $ft">, ASE_MT; - -def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), - "cftc1 $rt, $ft">, ASE_MT; - - -def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mttc0 $rt, $rd, $sel">, ASE_MT; - -def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), - "mttgpr $rd, $rt">, ASE_MT; - -def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttlo $rt, $ac">, ASE_MT; - -def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mtthi $rt, $ac">, ASE_MT; - -def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttacx $rt, $ac">, ASE_MT; - -def MTTDSP : MipsAsmPseudoInst<(outs), (ins 
GPR32Opnd:$rt), - "mttdsp $rt">, ASE_MT; - -def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mttc1 $rt, $ft">, ASE_MT; - -def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mtthc1 $rt, $ft">, ASE_MT; - -def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), - "cttc1 $rt, $ft">, ASE_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Definitions //===----------------------------------------------------------------------===// @@ -187,22 +95,4 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; - - def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 49ae6dd4cd39..4be26dd25dc0 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, - SDValue CmpLHS, const SDLoc &DL, - SDNode *Node) const { - unsigned Opc = InFlag.getOpcode(); (void)Opc; - - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == 
ISD::SUBE)) && - "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); - - unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; - if (Subtarget->isGP64bit()) { - SLTuOp = Mips::SLTu64; - ADDuOp = Mips::DADDu; - } - - SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; +void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = InFlag.getOpcode(); SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); - - if (Subtarget->isGP64bit()) { - // On 64-bit targets, sltu produces an i64 but our backend currently says - // that SLTu64 produces an i32. We need to fix this in the long run but for - // now, just make the DAG type-correct by asserting the upper bits are zero. - Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, - CurDAG->getTargetConstant(0, DL, VT), - SDValue(Carry, 0), - CurDAG->getTargetConstant(Mips::sub_32, DL, - VT)); + // In the base case, we can rely on the carry bit from the addsc + // instruction. + if (Opc == ISD::ADDC) { + SDValue Ops[3] = {LHS, RHS, InFlag}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); + return; } - // Generate a second addition only if we know that RHS is not a - // constant-zero node. - SDNode *AddCarry = Carry; - ConstantSDNode *C = dyn_cast(RHS); - if (!C || C->getZExtValue()) - AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); + assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!"); + + // The more complex case is when there is a chain of ISD::ADDE nodes like: + // (adde (adde (adde (addc a b) c) d) e). + // + // The addwc instruction does not write to the carry bit, instead it writes + // to bit 20 of the dsp control register. To match this series of nodes, each + // intermediate adde node must be expanded to write the carry bit before the + // addition. 
+ + // Start by reading the overflow field for addsc and moving the value to the + // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP + // corresponds to reading/writing the entire control register to/from a GPR. + + SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32); + + SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32); + + SDNode *DSPCtrlField = + CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag); + + SDNode *Carry = CurDAG->getMachineNode( + Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne); - CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); + SDValue Ops[4] = {SDValue(DSPCtrlField, 0), + CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne, + SDValue(Carry, 0)}; + SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops); + + // My reading of the the MIPS DSP 3.01 specification isn't as clear as I + // would like about whether bit 20 always gets overwritten by addwc. + // Hence take an extremely conservative view and presume it's sticky. We + // therefore need to clear it. + + SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32); + + SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)}; + SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps); + + SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue, + SDValue(DSPCtrlFinal, 0), CstOne); + + SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands); } /// Match frameindex @@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { switch(Opcode) { default: break; - case ISD::SUBE: { - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ? 
Mips::DSUBu : Mips::SUBu; - selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); - return true; - } - case ISD::ADDE: { - if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. - break; - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu; - selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); + selectAddE(Node, DL); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index f89a350cab04..6f38289c5a45 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,8 +41,7 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, - const SDLoc &DL, SDNode *Node) const; + void selectAddE(SDNode *Node, const SDLoc &DL) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 06a97b9d123e..72b2738bfac4 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setTargetDAGCombine(ISD::ADDE); - setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } -// selectMADD - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc multLo, Lo0), (adde multHi, Hi0), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi 
register -// Return true if pattern matching was successful. -static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { - // ADDENode's second operand must be a flag output of an ADDC node in order - // for the matching to be successful. - SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); - - if (ADDCNode->getOpcode() != ISD::ADDC) - return false; - - SDValue MultHi = ADDENode->getOperand(0); - SDValue MultLo = ADDCNode->getOperand(0); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MADD only if ADDENode and ADDCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than ADDENode or ADDCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MADD instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(ADDENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - ADDCNode->getOperand(1), - ADDENode->getOperand(1)); - - // create MipsMAdd(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MAddu : MipsISD::MAdd; - - SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of adde and addc here - if (!SDValue(ADDCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); - } - if (!SDValue(ADDENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); - } - - return true; -} - -// selectMSUB - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc Lo0, multLo), (sube Hi0, multHi), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { - // SUBENode's second operand must be a flag output of an SUBC node in order - // for the matching to be successful. - SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); - - if (SUBCNode->getOpcode() != ISD::SUBC) - return false; - - SDValue MultHi = SUBENode->getOperand(1); - SDValue MultLo = SUBCNode->getOperand(1); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MSUB only if SUBENode and SUBCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. 
- // If there exist users other than SUBENode or SUBCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MSUB instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(SUBENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - SUBCNode->getOperand(0), - SUBENode->getOperand(0)); - - // create MipsSub(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub; - - SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of sube and subc here - if (!SDValue(SUBCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); - } - if (!SDValue(SUBENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); - } - - return true; -} - -static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && - 
selectMSUB(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { - case ISD::ADDE: - return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; - case ISD::SUBE: - return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: @@ -3596,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(RC); + unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) + .addImm(0) + .addReg(Rs) + .addImm(Mips::sub_32); + Rs = Tmp; + } BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::SH : Mips::SH64)) .addReg(Rs) .addReg(Rt) @@ -3649,6 +3481,12 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, for (unsigned i = 1; i < MI.getNumOperands(); i++) MIB.add(MI.getOperand(i)); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); + Rt = Tmp; + } + BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt); MI.eraseFromParent(); @@ -3716,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, assert(Subtarget.hasMSA() && Subtarget.hasMips32r2()); bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64; + bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -3726,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1; + unsigned MFC1Opc = IsFGR64onMips64 + ? Mips::DMFC1 + : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1); unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. @@ -3735,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; - if (!Subtarget.hasMips64() && IsFGR64) { + if (IsFGR64onMips32) { unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); @@ -3829,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? 
&Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1; + unsigned MTC1Opc = IsFGR64onMips64 + ? Mips::DMTC1 + : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W; unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 8ec55ab6284d..c2947bb44ef5 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -226,7 +226,6 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo -def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -256,7 +255,6 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo -def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -666,14 +664,12 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7619e7b08612..cce3b8c4c8d1 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Disable use of the `jal` instruction. 
+ bool UseLongCalls = false; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -269,6 +272,8 @@ public: bool useSoftFloat() const { return IsSoftFloat; } + bool useLongCalls() const { return UseLongCalls; } + bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index af24838665e1..7d9f99ce071e 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -119,9 +119,6 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, - int16_t Imm1, int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a00b56af0490..92c8c224b71b 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -271,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3aaf7ef2c2a0..901539b682ba 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ 
b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -178,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -211,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } // Select an address into a single register. @@ -305,6 +309,7 @@ private: bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; @@ -2999,6 +3004,25 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val: +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast(N); + StoreSDNode *STN = dyn_cast(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. 
+ return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0e069ec1665f..b3a3c73f6df0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2130,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. 
@@ -2145,7 +2145,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2169,7 +2169,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2196,7 +2196,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2206,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2321,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. 
if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 821927d3b157..49d7d8220af1 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -616,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 13b4f9ab962d..e74ba38c351f 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1635,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons. 
- for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), - IE = MRI->use_instr_end(); I != IE; ++I) { + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { unsigned Pred = UseMI->getOperand(0).getImm(); @@ -1658,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL; ++I) { bool FoundUse = false; - for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), - JE = MRI->use_instr_end(); J != JE; ++J) + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end(); + J != JE; ++J) if (&*J == &*I) { FoundUse = true; break; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 6d9f55206b6a..dd7fc2659102 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -405,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ return cast(N)->getAlignment() < 4; }]>; +// This is a somewhat weaker condition than actually checking for 16-byte +// alignment. It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form +// instructions). 
+def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -815,7 +834,8 @@ def pred : Operand { def iaddr : ComplexPattern; def xaddr : ComplexPattern; def xoaddr : ComplexPattern; -def ixaddr : ComplexPattern; // "std" +def ixaddr : ComplexPattern; // "std" +def iqaddr : ComplexPattern; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43635a8919e2..942e8b392b82 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -2606,37 +2606,41 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; - - def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : 
Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; + def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad 
xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2788,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; 
+ def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2881,13 +2885,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); @@ -2911,13 +2915,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3088,17 +3092,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 
(scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 8af7f7e98117..9207165c46a6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -754,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -852,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -883,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. 
assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1076,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 2dc3828334ac..be705507b534 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,8 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 11004c5a952f..91cab00b2b65 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -20,6 +20,10 @@ include "llvm/Target/Target.td" // SPARC Subtarget features. 
// +def FeatureSoftMulDiv + : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true", + "Use software emulation for integer multiply and divide">; + def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -75,7 +79,7 @@ class Proc Features> : Processor; def : Proc<"generic", []>; -def : Proc<"v7", []>; +def : Proc<"v7", [FeatureSoftMulDiv]>; def : Proc<"v8", []>; def : Proc<"supersparc", []>; def : Proc<"sparclite", []>; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 9e7e3c6b705a..6767a59a9757 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1689,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::MUL, MVT::i32, Expand); + if (Subtarget->useSoftMulDiv()) { + // .umul works for both signed and unsigned + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setLibcallName(RTLIB::MUL_I32, ".umul"); + + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SDIV_I32, ".div"); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setLibcallName(RTLIB::UDIV_I32, ".udiv"); + } + if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index ae45c8be6752..3194ad4aeb6b 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">; // True when generating 64-bit code. This also implies HasV9. 
def Is64Bit : Predicate<"Subtarget->is64Bit()">; +def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">, + AssemblerPredicate<"FeatureSoftMulDiv">; + // HasV9 - This predicate is true when the target processor supports V9 // instructions. Note that the machine may be running in 32-bit mode. def HasV9 : Predicate<"Subtarget->isV9()">, diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 43ddef3cc96e..daac56add87c 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + UseSoftMulDiv = false; IsV9 = false; IsLeon = false; V8DeprecatedInsts = false; diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index fa42da425ff2..d18139984b87 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -32,6 +32,7 @@ class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { Triple TargetTriple; virtual void anchor(); + bool UseSoftMulDiv; bool IsV9; bool IsLeon; bool V8DeprecatedInsts; @@ -76,6 +77,7 @@ public: bool enableMachineScheduler() const override; + bool useSoftMulDiv() const { return UseSoftMulDiv; } bool isV9() const { return IsV9; } bool isLeon() const { return IsLeon; } bool isVIS() const { return IsVIS; } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ee23692ad1db..33680789ee08 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -275,6 +275,10 @@ public: SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override; + /// getLocRange - Get the range between the first and last token of this + /// operand. 
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + // Used by the TableGen code to add particular types of operand // to an instruction. void addRegOperands(MCInst &Inst, unsigned N) const { @@ -1164,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } +std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS); + bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1209,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = SystemZMnemonicSpellCheck( + ((SystemZOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((SystemZOperand &)*Operands[0]).getLocRange()); + } } llvm_unreachable("Unexpected match type"); diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt index 6f8431db7b11..9b8b141fd52a 100644 --- a/lib/Target/SystemZ/LLVMBuild.txt +++ b/lib/Target/SystemZ/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = SystemZCodeGen parent = SystemZ -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target +required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index c5faa0d62881..fda9c30fe3fc 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -187,6 +187,58 @@ def Arch11NewFeatures : SystemZFeatureList<[ FeatureVector ]>; 
+//===----------------------------------------------------------------------===// +// +// New features added in the Twelvth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions2 : SystemZFeature< + "miscellaneous-extensions-2", "MiscellaneousExtensions2", + "Assume that the miscellaneous-extensions facility 2 is installed" +>; + +def FeatureGuardedStorage : SystemZFeature< + "guarded-storage", "GuardedStorage", + "Assume that the guarded-storage facility is installed" +>; + +def FeatureMessageSecurityAssist7 : SystemZFeature< + "message-security-assist-extension7", "MessageSecurityAssist7", + "Assume that the message-security-assist extension facility 7 is installed" +>; + +def FeatureMessageSecurityAssist8 : SystemZFeature< + "message-security-assist-extension8", "MessageSecurityAssist8", + "Assume that the message-security-assist extension facility 8 is installed" +>; + +def FeatureVectorEnhancements1 : SystemZFeature< + "vector-enhancements-1", "VectorEnhancements1", + "Assume that the vector enhancements facility 1 is installed" +>; +def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">; + +def FeatureVectorPackedDecimal : SystemZFeature< + "vector-packed-decimal", "VectorPackedDecimal", + "Assume that the vector packed decimal facility is installed" +>; + +def FeatureInsertReferenceBitsMultiple : SystemZFeature< + "insert-reference-bits-multiple", "InsertReferenceBitsMultiple", + "Assume that the insert-reference-bits-multiple facility is installed" +>; + +def Arch12NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions2, + FeatureGuardedStorage, + FeatureMessageSecurityAssist7, + FeatureMessageSecurityAssist8, + FeatureVectorEnhancements1, + FeatureVectorPackedDecimal, + FeatureInsertReferenceBitsMultiple +]>; + //===----------------------------------------------------------------------===// // // Cumulative supported and 
unsupported feature sets @@ -201,9 +253,13 @@ def Arch10SupportedFeatures : SystemZFeatureAdd; def Arch11SupportedFeatures : SystemZFeatureAdd; +def Arch12SupportedFeatures + : SystemZFeatureAdd; -def Arch11UnsupportedFeatures +def Arch12UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch11UnsupportedFeatures + : SystemZFeatureAdd; def Arch10UnsupportedFeatures : SystemZFeatureAdd; def Arch9UnsupportedFeatures diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2801141cd951..2d916d2e1521 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -101,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } - addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (Subtarget.hasVectorEnhancements1()) + addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); + else + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); if (Subtarget.hasVector()) { addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); @@ -316,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::CTPOP, VT, Legal); + else + setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); @@ -414,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::v2f64, Legal); } + // The vector enhancements facility 1 has instructions for these. 
+ if (Subtarget.hasVectorEnhancements1()) { + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FABS, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f128, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f128, Legal); + setOperationAction(ISD::FMINNUM, MVT::f128, Legal); + setOperationAction(ISD::FMINNAN, MVT::f128, Legal); + } + // We have fused multiply-addition for f32 and f64 but not f128. 
setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); - setOperationAction(ISD::FMA, MVT::f128, Expand); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FMA, MVT::f128, Legal); + else + setOperationAction(ISD::FMA, MVT::f128, Expand); + + // We don't have a copysign instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); // Needed so that we don't try to implement f128 constant loads using // a load-and-extend of a f80 constant (in cases where the constant @@ -425,6 +481,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + // We don't have extending load instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + } + // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); @@ -489,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f64: return true; case MVT::f128: - return false; + return Subtarget.hasVectorEnhancements1(); default: break; } @@ -1462,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { return true; case Intrinsic::s390_vfcedbs: + case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchdbs: + case Intrinsic::s390_vfchsbs: Opcode = SystemZISD::VFCMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchedbs: + case Intrinsic::s390_vfchesbs: Opcode = SystemZISD::VFCMPHES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vftcidb: + case Intrinsic::s390_vftcisb: Opcode = SystemZISD::VFTCI; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL, // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, // producing a result of type VT. -static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, - EVT VT, SDValue CmpOp0, SDValue CmpOp1) { - // There is no hardware support for v4f32, so extend the vector into - // two v2f64s and compare those. - if (CmpOp0.getValueType() == MVT::v4f32) { +SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, + SDValue CmpOp1) const { + // There is no hardware support for v4f32 (unless we have the vector + // enhancements facility 1), so extend the vector into two v2f64s + // and compare those. 
+ if (CmpOp0.getValueType() == MVT::v4f32 && + !Subtarget.hasVectorEnhancements1()) { SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); @@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. -static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - ISD::CondCode CC, SDValue CmpOp0, - SDValue CmpOp1) { +SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, + const SDLoc &DL, EVT VT, + ISD::CondCode CC, + SDValue CmpOp0, + SDValue CmpOp1) const { bool IsFP = CmpOp0.getValueType().isFloatingPoint(); bool Invert = false; SDValue Cmp; @@ -2960,6 +3032,12 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, // We define this so that it can be used for constant division. lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + else if (Subtarget.hasMiscellaneousExtensions2()) + // SystemZISD::SMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. 
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // @@ -4658,6 +4736,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); + OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); OPCODE(SDIVREM); OPCODE(UDIVREM); @@ -6118,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: + case SystemZ::SelectVR128: return emitSelect(MI, MBB, 0); case SystemZ::CondStore8Mux: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 6c9c404816f0..abe8b7233e60 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -88,6 +88,7 @@ enum NodeType : unsigned { // Wrappers around the ISD opcodes of the same name. The output is GR128. // Input operands may be GR64 or GR32, depending on the instruction. + SMUL_LOHI, UMUL_LOHI, SDIVREM, UDIVREM, @@ -479,6 +480,12 @@ private: const SystemZSubtarget &Subtarget; // Implement LowerOperation for individual opcodes. 
+ SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, SDValue CmpOp1) const; + SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, ISD::CondCode CC, + SDValue CmpOp0, SDValue CmpOp1) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 10172bd45203..02aeaadad0d9 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. -def SelectF32 : SelectWrapper; -def SelectF64 : SelectWrapper; -def SelectF128 : SelectWrapper; +def SelectF32 : SelectWrapper; +def SelectF64 : SelectWrapper; +let Predicates = [FeatureNoVectorEnhancements1] in + def SelectF128 : SelectWrapper; +let Predicates = [FeatureVectorEnhancements1] in + def SelectVR128 : SelectWrapper; defm CondStoreF32 : CondStores; @@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in { let Predicates = [FeatureVector] in { defm : CompareZeroFP; defm : CompareZeroFP; - defm : CompareZeroFP; } +let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in + defm : CompareZeroFP; // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; @@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in { } // The sign of an FP128 is in the high register. 
-def : Pat<(fcopysign FP32:$src1, FP128:$src2), - (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP64 result. let isCodeGenOnly = 1 in @@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP64:$src1, FP128:$src2), - (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP128 result. Use "upper" as the high half and leave // the low half as-is. 
@@ -101,12 +113,14 @@ class CopySign128 : Pat<(fcopysign FP128:$src1, cls:$src2), (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; -def : CopySign128; -def : CopySign128; -def : CopySign128; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128; + def : CopySign128; + def : CopySign128; +} defm LoadStoreF32 : MVCLoadStore; defm LoadStoreF64 : MVCLoadStore; @@ -166,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, Requires<[FeatureFPExtension]>; -def : Pat<(f32 (fpround FP128:$src)), - (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; -def : Pat<(f64 (fpround FP128:$src)), - (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; + def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +} // Extend register floating-point values to wider representations. -def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>; +def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; +def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; +def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; +} // Extend memory floating-point values to wider representations. 
def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>; +def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; +def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + (LXEB bdxaddr12only:$src)>; + def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + (LXDB bdxaddr12only:$src)>; +} // Convert a signed integer register value to a floating-point one. def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; @@ -426,16 +452,18 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)), // f128 multiplication of two FP64 registers. def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), - (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), - FP64:$src1, subreg_h64), FP64:$src2)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), + FP64:$src1, subreg_h64), FP64:$src2)>; // f128 multiplication of an FP64 register and an f64 memory. def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), - (f128 (extloadf64 bdxaddr12only:$addr))), - (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), - bdxaddr12only:$addr)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), + (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), + bdxaddr12only:$addr)>; // Fused multiply-add. 
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>; diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 7620e06ccbc9..033a0a879d37 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1091,6 +1091,94 @@ class InstVRIe op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRIf op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<5> V3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-28} = V3{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M5; + let Inst{19-12} = I4; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9} = V3{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIg op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<8> I3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = I4; + let Inst{23-20} = M5; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIh op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> I2; + bits<4> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = 0; + let Inst{31-16} = I2; + let Inst{15-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIi op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, 
asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<4> R2; + bits<8> I3; + bits<4> M4; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = R2; + let Inst{31-24} = 0; + let Inst{23-20} = M4; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + // Depending on the instruction mnemonic, certain bits may be or-ed into // the M4 value provided as explicit operand. These are passed as m4or. class InstVRRa op, dag outs, dag ins, string asmstr, list pattern, @@ -1259,6 +1347,67 @@ class InstVRRf op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRRg op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRh op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-28} = V2{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9} = V2{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRi op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<4> R1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = R1; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let 
Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstVRSa op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1321,6 +1470,25 @@ class InstVRSc op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVRSd op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<4> R3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = R3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + class InstVRV op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1358,6 +1526,24 @@ class InstVRX op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } +class InstVSI op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<8> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-32} = I3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + //===----------------------------------------------------------------------===// // Instruction classes for .insn directives //===----------------------------------------------------------------------===// @@ -1910,6 +2096,25 @@ class FixedCondBranchRX opcode> let M1 = V.ccmask; } +class CondBranchRXY opcode> + : InstRXYb { + let CCMaskFirst = 1; +} + +class AsmCondBranchRXY opcode> + : InstRXYb; + +class FixedCondBranchRXY opcode, + SDPatternOperator operator = null_frag> + : InstRXYb { + let isAsmParserOnly = V.alternate; + let M1 = V.ccmask; +} + class CmpBranchRIEa opcode, 
RegisterOperand cls, Immediate imm> : InstRIEa opcode, let AccessBytes = bytes; } +class StoreLengthVRSd opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVRSd { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreLengthVSI opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVSI { + let mayStore = 1; + let AccessBytes = bytes; +} + class StoreMultipleRS opcode, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa opcode, : InstRXa; +class SideEffectBinaryRXY opcode, + RegisterOperand cls> + : InstRXYa; + class SideEffectBinaryRILPC opcode, RegisterOperand cls> : InstRILb opcode> (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>; +class BinaryVRIh opcode> + : InstVRIh; + class BinaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0> : InstVRRa opcode, SDPatternOperator operator, mnemonic#"\t$V1, $R2, $R3", [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>; +class BinaryVRRi opcode, RegisterOperand cls> + : InstVRRi; + class BinaryVRSa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> : InstVRSa opcode> (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4), mnemonic#"\t$R1, $V3, $BD2, $M4", []>; +class BinaryVRSd opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVRSd { + let mayLoad = 1; + let AccessBytes = bytes; +} + class BinaryVRX opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes> : InstVRX opcode, RegisterOperand cls> let mayStore = 1; } +class BinaryVSI opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVSI { + let mayLoad = 1; + let AccessBytes = bytes; +} + class StoreBinaryVRV opcode, bits<5> bytes, Immediate index> : InstVRV opcode> let M5 = 0; } +class CompareVRRh opcode> + : InstVRRh { + let isCompare = 1; +} + class TestRXE opcode, SDPatternOperator operator, RegisterOperand cls> : InstRXE opcode> let mayLoad = 1; } +class TestVRRg opcode> + : 
InstVRRg; + class SideEffectTernarySSc opcode> : InstSSc opcode, SDPatternOperator operator, let M5 = type; } +class TernaryVRIi opcode, RegisterOperand cls> + : InstVRIi; + class TernaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcFloat opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0, bits<4> m5 = 0> + : InstVRRc { + let M4 = type; + let M5 = m5; +} + +class TernaryVRRcFloatGeneric opcode> + : InstVRRc; + class TernaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0> : InstVRRd opcode> let DisableEncoding = "$V1src"; } +class QuaternaryVRIf opcode> + : InstVRIf; + +class QuaternaryVRIg opcode> + : InstVRIg; + class QuaternaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, - bits<4> type, SDPatternOperator m6mask, bits<4> m6or> + TypedReg tr3, TypedReg tr4, bits<4> type, + SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> : InstVRRd { let M5 = type; } +class QuaternaryVRRdGeneric opcode> + : InstVRRd; + // Declare a pair of instructions, one which sets CC and one which doesn't. // The CC-setting form ends with "S" and sets the low bit of M6. // Also create aliases to make use of M6 operand optional in assembler. 
@@ -4041,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair opcode, SDPatternOperator operator_cc, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { - def "" : QuaternaryVRRd; def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in - def S : QuaternaryVRRd; def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, @@ -4055,10 +4364,7 @@ multiclass QuaternaryOptVRRdSPair opcode, } multiclass QuaternaryOptVRRdSPairGeneric opcode> { - def "" : InstVRRd; + def "" : QuaternaryVRRdGeneric; def : InstAlias(NAME) VR128:$V1, VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, 0)>; @@ -4366,10 +4672,10 @@ class RotateSelectRIEfPseudo // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is // the value of the PSW's 2-bit condition code field. -class SelectWrapper +class SelectWrapper : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), - [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, + [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 66a5ff12be46..4533f4fdf21a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -869,6 +869,37 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Move 128-bit floating-point values between VR128 and FP128. 
+ if (SystemZ::VR128BitRegClass.contains(DestReg) && + SystemZ::FP128BitRegClass.contains(SrcReg)) { + unsigned SrcRegHi = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned SrcRegLo = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg) + .addReg(SrcRegHi, getKillRegState(KillSrc)) + .addReg(SrcRegLo, getKillRegState(KillSrc)); + return; + } + if (SystemZ::FP128BitRegClass.contains(DestReg) && + SystemZ::VR128BitRegClass.contains(SrcReg)) { + unsigned DestRegHi = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned DestRegLo = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + if (DestRegHi != SrcReg) + copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false); + BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1); + return; + } + // Everything else needs only one instruction. 
unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) @@ -1434,6 +1465,7 @@ SystemZII::Branch SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { switch (MI.getOpcode()) { case SystemZ::BR: + case SystemZ::BI: case SystemZ::J: case SystemZ::JG: return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 4569be7602e4..f64c0d15ef83 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BC : CondBranchRX<"b#", 0x47>; def BCR : CondBranchRR<"b#r", 0x07>; + def BIC : CondBranchRXY<"bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BCAsm : AsmCondBranchRX<"bc", 0x47>; def BCRAsm : AsmCondBranchRR<"bcr", 0x07>; + def BICAsm : AsmCondBranchRXY<"bic", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } // Define AsmParser extended mnemonics for each general condition-code mask @@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BAsm#V : FixedCondBranchRX , "b#", 0x47>; def BRAsm#V : FixedCondBranchRR , "b#r", 0x07>; + def BIAsm#V : FixedCondBranchRXY, "bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } } @@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isIndirectBranch = 1 in { def B : FixedCondBranchRX; def BR : FixedCondBranchRR; + def BI : FixedCondBranchRXY, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -316,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in { // Select instructions //===----------------------------------------------------------------------===// -def Select32Mux : SelectWrapper, Requires<[FeatureHighWord]>; -def Select32 : 
SelectWrapper; -def Select64 : SelectWrapper; +def Select32Mux : SelectWrapper, Requires<[FeatureHighWord]>; +def Select32 : SelectWrapper; +def Select64 : SelectWrapper; // We don't define 32-bit Mux stores if we don't have STOCFH, because the // low-only STOC should then always be used if possible. @@ -921,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Addition of memory. defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>; defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>; + def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>; def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>; @@ -1006,6 +1016,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Subtraction of memory. defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; + def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>; def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>; } @@ -1207,6 +1219,15 @@ defm : RMWIByte; // Multiplication //===----------------------------------------------------------------------===// +// Multiplication of a register, setting the condition code. We prefer these +// over MS(G)R if available, even though we cannot use the condition code, +// since they are three-operand instructions. +let Predicates = [FeatureMiscellaneousExtensions2], + Defs = [CC], isCommutable = 1 in { + def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>; + def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>; +} + // Multiplication of a register. 
let isCommutable = 1 in { def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>; @@ -1226,21 +1247,37 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>; // Multiplication of memory. defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>; defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>; +def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; +// Multiplication of memory, setting the condition code. +let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in { + def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>; + def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>; +} + // Multiplication of a register, producing two results. -def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, + Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), + (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. 
def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, + Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; @@ -1765,8 +1802,29 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { GR128, GR128, GR128>; def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; } + let Predicates = [FeatureMessageSecurityAssist5] in - def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in + def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist8] in + def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, + GR128, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureGuardedStorage] in { + def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; + def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; + + let mayLoad = 1 in + def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>; + let mayStore = 1 in + def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index a9803c2d83e9..0112ebf1eb10 100644 --- 
a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -126,6 +126,10 @@ let hasSideEffects = 1, Defs = [CC] in let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; +// Insert reference bits multiple. +let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in + def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>; + // Perform frame management function. let hasSideEffects = 1 in def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 0158fe6aec08..c9a02d9c8082 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,7 +14,7 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; - def VLR32 : UnaryAliasVRR; + def VLR32 : UnaryAliasVRR; def VLR64 : UnaryAliasVRR; // Load GR from VR element. @@ -141,7 +141,7 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX; + def VL32 : UnaryAliasVRX; def VL64 : UnaryAliasVRX; // Load logical element and zero. @@ -154,6 +154,11 @@ let Predicates = [FeatureVector] in { (VLLEZF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)), (VLLEZG bdxaddr12only:$addr)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>; + def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)), + (VLLEZLF bdxaddr12only:$addr)>; + } // Load element. 
def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; @@ -170,6 +175,13 @@ let Predicates = [FeatureVector] in { def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Load rightmost with length. The number of loaded bytes is only known + // at run time. + def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>; + def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>; +} + // Use replicating loads if we're inserting a single element into an // undefined vector. This avoids a false dependency on the previous // register contents. @@ -219,7 +231,7 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VST32 : StoreAliasVRX; + def VST32 : StoreAliasVRX; def VST64 : StoreAliasVRX; // Scatter element. @@ -227,6 +239,13 @@ let Predicates = [FeatureVector] in { def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Store rightmost with length. The number of stored bytes is only known + // at run time. + def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>; + def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>; +} + //===----------------------------------------------------------------------===// // Selects and permutes //===----------------------------------------------------------------------===// @@ -256,6 +275,10 @@ let Predicates = [FeatureVector] in { // Permute doubleword immediate. def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; + // Bit Permute. + let Predicates = [FeatureVectorEnhancements1] in + def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>; + // Replicate. 
def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>; def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; @@ -424,6 +447,10 @@ let Predicates = [FeatureVector] in { def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; + // Not exclusive or. + let Predicates = [FeatureVectorEnhancements1] in + def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>; + // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -567,6 +594,17 @@ let Predicates = [FeatureVector] in { def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; + // Multiply sum logical. + let Predicates = [FeatureVectorEnhancements1] in { + def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>; + def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg, + v128q, v128g, v128g, v128q, 3>; + } + + // Nand. + let Predicates = [FeatureVectorEnhancements1] in + def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>; + // Nor. def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>; def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>; @@ -574,9 +612,19 @@ let Predicates = [FeatureVector] in { // Or. def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>; + // Or with complement. + let Predicates = [FeatureVectorEnhancements1] in + def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>; + // Population count. 
def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>; def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>; + def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>; + def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>; + def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>; + } // Element rotate left logical (with vector shift amount). def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; @@ -724,6 +772,14 @@ multiclass BitwiseVectorOps { (VNO VR128:$x, VR128:$y)>; def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; } + let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))), + (VNX VR128:$x, VR128:$y)>; + def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))), + (VNN VR128:$x, VR128:$y)>; + def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))), + (VOC VR128:$x, VR128:$y)>; + } } defm : BitwiseVectorOps; @@ -879,6 +935,11 @@ let Predicates = [FeatureVector] in { def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>; + def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>; + def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>; + } // Convert from fixed 64-bit. 
def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; @@ -910,6 +971,11 @@ let Predicates = [FeatureVector] in { def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>; + def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>; + def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>; + } // Load FP integer. def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; @@ -917,66 +983,213 @@ let Predicates = [FeatureVector] in { def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; defm : VectorRounding; defm : VectorRounding; + let Predicates = [FeatureVectorEnhancements1] in { + def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; + def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; + def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + defm : VectorRounding; + defm : VectorRounding; + defm : VectorRounding; + } // Load lengthened. 
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; - def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>; + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; + def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; + def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + } + def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>; + def : Pat<(f128 (fpextend (f32 VR32:$src))), + (WFLLD (WLDEB VR32:$src))>; + } - // Load rounded, + // Load rounded. def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; - def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; - def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; - def : FPConversion; + def : FPConversion; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; + def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } + def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; + def : FPConversion; + def : Pat<(f32 (fpround (f128 VR128:$src))), + (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>; + } + + // Maximum. 
+ multiclass VectorMax { + def : FPMinMax; + def : FPMinMax; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; + def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, + v128db, v128db, 3, 0>; + def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, + v64db, v64db, 3, 8>; + def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, + v128sb, v128sb, 2, 0>; + def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, + v32sb, v32sb, 2, 8>; + def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + defm : VectorMax; + } + + // Minimum. + multiclass VectorMin { + def : FPMinMax; + def : FPMinMax; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; + def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, + v128db, v128db, 3, 0>; + def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, + v64db, v64db, 3, 8>; + def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, + v128sb, v128sb, 2, 0>; + def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, + v32sb, v32sb, 2, 8>; + def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + defm : VectorMin; + } // Multiply. def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + } // Multiply and add. 
def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + } // Multiply and subtract. def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and add. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and subtract. 
+ let Predicates = [FeatureVectorEnhancements1] in { + def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + } // Perform sign operation. def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>; def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>; def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>; + def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>; + def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>; + } // Load complement. def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>; + def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>; + def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>; + } // Load negative. def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>; + def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>; + def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>; + } // Load positive. 
def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>; + def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>; + def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>; + } // Square root. def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + } // Subtract. def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + } // Test data class immediate. 
let Defs = [CC] in { def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>; def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>; + def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>; + def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>; + } } } @@ -989,12 +1202,20 @@ let Predicates = [FeatureVector] in { let Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>; + def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>; + } } // Compare and signal scalar. let Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>; + def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>; + } } // Compare equal. @@ -1003,6 +1224,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal equal. 
+ let Predicates = [FeatureVectorEnhancements1] in { + defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high. def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; @@ -1010,6 +1253,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high or equal. 
def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; @@ -1017,6 +1282,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high or equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } } //===----------------------------------------------------------------------===// @@ -1028,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; 
+def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), 
(f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; //===----------------------------------------------------------------------===// // Replicating scalars @@ -1133,6 +1433,20 @@ let AddedComplexity = 4 in { (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>; } +//===----------------------------------------------------------------------===// +// Support for 128-bit floating-point values in vector registers +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(f128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + + def : Pat<(f128 fpimm0), (VZERO)>; + def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>; +} + //===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// @@ -1202,3 +1516,37 @@ let Predicates = [FeatureVector] in { defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf, z_vstrcz_cc, v128f, v128f, 2, 2>; } + +//===----------------------------------------------------------------------===// +// Packed-decimal instructions +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorPackedDecimal] in { + def VLIP : BinaryVRIh<"vlip", 0xE649>; + + def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>; + def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; + + let Defs = [CC] in { + def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; + def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; + def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>; 
+ + def VAP : QuaternaryVRIf<"vap", 0xE671>; + def VSP : QuaternaryVRIf<"vsp", 0xE673>; + + def VMP : QuaternaryVRIf<"vmp", 0xE678>; + def VMSP : QuaternaryVRIf<"vmsp", 0xE679>; + + def VDP : QuaternaryVRIf<"vdp", 0xE67A>; + def VRP : QuaternaryVRIf<"vrp", 0xE67B>; + def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>; + + def VSRP : QuaternaryVRIg<"vsrp", 0xE659>; + def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>; + + def VTP : TestVRRg<"vtp", 0xE65F>; + def VCP : CompareVRRh<"vcp", 0xE677>; + } +} diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 9c6d5819f8a7..759a8bb0ce14 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -181,6 +181,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; +def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; @@ -549,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), (fma node:$src2, node:$src3, (fneg node:$src1))>; +// Negative fused multiply-add and multiply-subtract. +def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fma node:$src1, node:$src2, node:$src3))>; +def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fms node:$src1, node:$src2, node:$src3))>; + // Floating-point negative absolute. def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>; @@ -624,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr), (scalar_to_vector (f64 (load node:$addr))), (z_vzero))>; +// Similarly for the high element of a zeroed vector. 
+def z_vllezli32 : z_vllez; +def z_vllezlf32 : PatFrag<(ops node:$addr), + (bitconvert + (z_merge_high + (v2i64 + (bitconvert + (z_merge_high + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))), + (v4f32 (z_vzero))))), + (v2i64 (z_vzero))))>; + // Store one element of a vector. class z_vste : PatFrag<(ops node:$vec, node:$addr, node:$index), diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index 16a7ed784d70..152521fb66a8 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -167,3 +167,10 @@ class FPConversion suppress, bits<4> mode> : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))), (insn tr2.op:$vec, suppress, mode)>; + +// Use INSN to perform mininum/maximum operation OPERATOR on type TR. +// FUNCTION is the type of minimum/maximum function to perform. +class FPMinMax function> + : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))), + (insn tr.op:$vec1, tr.op:$vec2, function)>; diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index 1cdc0949ff4a..0dca4582dc0d 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>; def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; +def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; +def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 36809ea81dc1..52ba1a584017 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128", // All vector registers. 
defm VR128 : SystemZRegClass<"VR128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (add (sequence "V%u", 0, 7), - (sequence "V%u", 16, 31), - (sequence "V%u", 8, 15))>; + [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add (sequence "V%u", 0, 7), + (sequence "V%u", 16, 31), + (sequence "V%u", 8, 15))>; // Attaches a ValueType to a register operand, to make the instruction // definitions easier. @@ -272,7 +272,8 @@ class TypedReg { RegisterOperand op = opin; } -def v32eb : TypedReg; +def v32f : TypedReg; +def v32sb : TypedReg; def v64g : TypedReg; def v64db : TypedReg; def v128b : TypedReg; @@ -280,8 +281,9 @@ def v128h : TypedReg; def v128f : TypedReg; def v128g : TypedReg; def v128q : TypedReg; -def v128eb : TypedReg; +def v128sb : TypedReg; def v128db : TypedReg; +def v128xb : TypedReg; def v128any : TypedReg; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 1ce0168f95e9..8dba89f70a42 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def FPU2 : SchedWrite; def DFU : SchedWrite; def DFU2 : SchedWrite; -// Vector sub units (z13) +// Vector sub units (z13 and later) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; @@ -75,6 +75,7 @@ def VecXsPm : SchedWrite; def VBU : SchedWrite; +include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" include "SystemZScheduleZ196.td" diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td new file mode 100644 index 000000000000..f11177af91a5 --- /dev/null +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -0,0 +1,1611 @@ +//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z14 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Z14Model : SchedMachineModel { + + let UnsupportedFeatures = Arch12UnsupportedFeatures.List; + + let IssueWidth = 8; + let MicroOpBufferSize = 60; // Issue queues + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 20; +} + +let SchedModel = Z14Model in { + +// These definitions could be put in a subtarget common include file, +// but it seems the include system in Tablegen currently rejects +// multiple includes of same file. +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; +} +def : WriteRes { + let NumMicroOps = 0; + let EndGroup = 1; +} +def : WriteRes { let Latency = 2; let NumMicroOps = 0;} +def : WriteRes { let Latency = 3; let NumMicroOps = 0;} +def : WriteRes { let Latency = 4; let NumMicroOps = 0;} +def : WriteRes { let Latency = 5; let NumMicroOps = 0;} +def : WriteRes { let Latency = 6; let NumMicroOps = 0;} +def : WriteRes { let Latency = 7; let NumMicroOps = 0;} +def : WriteRes { let Latency = 8; let NumMicroOps = 0;} +def : WriteRes { let Latency = 9; let NumMicroOps = 0;} +def : WriteRes { let Latency = 10; let NumMicroOps = 0;} +def : WriteRes { let Latency = 11; let NumMicroOps = 0;} +def : WriteRes { let Latency = 12; let NumMicroOps = 0;} +def : WriteRes { let Latency = 15; let NumMicroOps = 0;} +def : WriteRes { let Latency = 20; let NumMicroOps = 0;} +def : WriteRes { let Latency = 30; let NumMicroOps = 0;} + +// Execution units. 
+def Z14_FXaUnit : ProcResource<2>; +def Z14_FXbUnit : ProcResource<2>; +def Z14_LSUnit : ProcResource<2>; +def Z14_VecUnit : ProcResource<2>; +def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z14_VBUnit : ProcResource<2>; + +// Subtarget specific definitions of scheduling resources. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 8; } +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 8; } +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes + { let Latency = 30; } +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 3; } +def : WriteRes; // Virtual Branching Unit + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
+ +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[VBU], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[VBU, FXa, FXa, Lat3, 
GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[FXb], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + +// Select pseudo +def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; + +// CondStore pseudos +def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; + +// Pseudo -> reg move +def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>; + +// Loads +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], 
(instregex "L128$")>; + +def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[FXa], (instregex "LG(F|H)I$")>; +def : InstRW<[FXa], (instregex "LHI(Mux)?$")>; +def : InstRW<[FXa], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>; +def : InstRW<[FXa], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "L(B|H|G)R$")>; +def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>; +def : InstRW<[FXa], (instregex "LTGFR$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>; +def : 
// NOTE(review): this chunk is a per-opcode scheduling table: each anonymous
// `def : InstRW<[resources...], (instregex "...")>` binds the instructions
// matched by the regex to a list of pipeline resources (FXa/FXb/LSU/Vec*),
// an optional latency override (LatN), and decoder-grouping tokens
// (GroupAlone/BeginGroup/EndGroup).  The mnemonics (RISBG, TBEGIN, CDSG, ...)
// look like a SystemZ model — confirm against the enclosing file's header,
// which lies outside this view.
// NOTE(review): the first line below is the tail of a def whose `def :`
// precedes this chunk; kept verbatim.
InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;

//===----------------------------------------------------------------------===//
// Zero extensions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
def : InstRW<[LSU], (instregex "LLHRL$")>;
def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;

// Load and zero rightmost byte
def : InstRW<[LSU], (instregex "LLZRGF$")>;

// Load and trap
def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;

//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//

// Load multiple (estimated average of 5 ops)
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
             (instregex "LM(H|Y|G)?$")>;

// Load multiple disjoint
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;

// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
              GroupAlone], (instregex "STM(G|H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Byte swaps
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;

//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;

// Load the Global Offset Table address ( -> larl )
def : InstRW<[FXa], (instregex "GOT$")>;

//===----------------------------------------------------------------------===//
// Absolute and Negation
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "LP(G)?R$")>;
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>;
def : InstRW<[FXa], (instregex "LN(R|GR)$")>;
def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;

//===----------------------------------------------------------------------===//
// Insertion
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
def : InstRW<[FXa], (instregex "IILF(64)?$")>;
def : InstRW<[FXa], (instregex "IILH(64)?$")>;
def : InstRW<[FXa], (instregex "IILL(64)?$")>;

//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXa], (instregex "AIH$")>;
def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
def : InstRW<[FXa], (instregex "AGFI$")>;
def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
def : InstRW<[FXa], (instregex "AGR(K)?$")>;
def : InstRW<[FXa], (instregex "AHI(K)?$")>;
def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
def : InstRW<[FXa], (instregex "ALGHSIK$")>;
def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;

// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;

// Add with sign extension (16/32 -> 64)
def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>;
def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;

//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
def : InstRW<[FXa], (instregex "SGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLFI$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLR(K)?$")>;
def : InstRW<[FXa], (instregex "SR(K)?$")>;
def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;

// Subtraction with borrow
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;

// Subtraction with sign extension (16/32 -> 64)
def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>;
def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;

//===----------------------------------------------------------------------===//
// AND
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
def : InstRW<[FXa], (instregex "NGR(K)?$")>;
def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
def : InstRW<[FXa], (instregex "NILF(64)?$")>;
def : InstRW<[FXa], (instregex "NILH(64)?$")>;
def : InstRW<[FXa], (instregex "NILL(64)?$")>;
def : InstRW<[FXa], (instregex "NR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;

//===----------------------------------------------------------------------===//
// OR
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
def : InstRW<[FXa], (instregex "OGR(K)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
def : InstRW<[FXa], (instregex "OILF(64)?$")>;
def : InstRW<[FXa], (instregex "OILH(64)?$")>;
def : InstRW<[FXa], (instregex "OILL(64)?$")>;
def : InstRW<[FXa], (instregex "OR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;

//===----------------------------------------------------------------------===//
// XOR
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
def : InstRW<[FXa], (instregex "XIFMux$")>;
def : InstRW<[FXa], (instregex "XGR(K)?$")>;
def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
def : InstRW<[FXa], (instregex "XILF(64)?$")>;
def : InstRW<[FXa], (instregex "XR(K)?$")>;
def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;

//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>;
def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>;
def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>;
def : InstRW<[FXa, Lat7], (instregex "MSGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>;
def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>;
def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat4], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat4], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>;
def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>;
def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>;
def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>;
def : InstRW<[FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>;
def : InstRW<[FXa, Lat8, GroupAlone], (instregex "MGRK$")>;
def : InstRW<[FXa, LSU, Lat9, GroupAlone], (instregex "MSC$")>;
def : InstRW<[FXa, LSU, Lat11, GroupAlone], (instregex "MSGC$")>;
def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>;
def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>;

//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//

def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;

//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
             (instregex "S(L|R)D(A|L)$")>;

// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;

// Rotate and insert
def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
def : InstRW<[FXa], (instregex "RISBMux$")>;

// Rotate and Select
def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>;

//===----------------------------------------------------------------------===//
// Comparison
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
def : InstRW<[FXb], (instregex "C(G)?R$")>;
def : InstRW<[FXb], (instregex "CIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
def : InstRW<[FXb], (instregex "CLGR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
def : InstRW<[FXb], (instregex "CLIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXb], (instregex "CLR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;

// Compare halfword
def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;

// Compare with sign extension (32 -> 64)
def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;

// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;

// Test under mask
def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;

// Compare logical characters under mask
def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;

//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//

def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
def : InstRW<[FXb], (instregex "NIAI$")>;

//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;

def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;

// Test and set
def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;

// Compare and swap
def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;

// Compare double and swap
def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
             (instregex "CDS(Y)?$")>;
def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
             (instregex "CDSG$")>;

// Compare and swap and store
def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;

// Perform locked operation
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;

// Load/store pair from/to quadword
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;

// Load pair disjoint
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;

//===----------------------------------------------------------------------===//
// Translate and convert
//===----------------------------------------------------------------------===//

def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;

//===----------------------------------------------------------------------===//
// Message-security assist
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>;
def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>;
def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>;

//===----------------------------------------------------------------------===//
// Guarded storage
//===----------------------------------------------------------------------===//

def : InstRW<[LSU], (instregex "LGG$")>;
def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>;

//===----------------------------------------------------------------------===//
// Decimal arithmetic
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
             (instregex "CVBG$")>;
def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
             (instregex "CVDG$")>;
def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;

def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
             (instregex "(A|S|ZA)P$")>;
def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
             (instregex "(M|D)P$")>;
def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
             (instregex "SRP$")>;
def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;

//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//

// Extract/set/copy access register
def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;

// Load address extended
def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;

// Load/store access multiple (not modeled precisely)
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;

//===----------------------------------------------------------------------===//
// Program mask and addressing mode
//===----------------------------------------------------------------------===//

// Insert Program Mask
def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;

// Set Program Mask
def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;

// Branch and link
def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;

// Test addressing mode
def : InstRW<[FXb], (instregex "TAM$")>;

// Set addressing mode
def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;

// Branch (and save) and set mode.
def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;

//===----------------------------------------------------------------------===//
// Transactional execution
//===----------------------------------------------------------------------===//

// Transaction begin
def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
             (instregex "TBEGIN(C|_nofloat)?$")>;

// Transaction end
def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;

// Transaction abort
def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;

// Extract Transaction Nesting Depth
def : InstRW<[FXa], (instregex "ETND$")>;

// Nontransactional store
def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;

//===----------------------------------------------------------------------===//
// Processor assist
//===----------------------------------------------------------------------===//

def : InstRW<[FXb], (instregex "PPA$")>;

//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//

// Find leftmost one
def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>;

// Population count
def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;

// Extend
def : InstRW<[FXa], (instregex "AEXT128$")>;
def : InstRW<[FXa], (instregex "ZEXT128$")>;

// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;

// Various complex instructions
def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;

// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;

//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//

// An "empty" sched-class will be assigned instead of the "invalid sched-class".
// getNumDecoderSlots() will then return 1 instead of 0.
def : InstRW<[], (instregex "Insn.*")>;


// ----------------------------- Floating point ----------------------------- //

//===----------------------------------------------------------------------===//
// FP: Select instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>;
def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;

//===----------------------------------------------------------------------===//
// FP: Move instructions
//===----------------------------------------------------------------------===//

// Load zero
def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;

// Load
def : InstRW<[VecXsPm], (instregex "LER$")>;
def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;

// Load and Test
def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
             (instregex "LTXBRCompare(_VecPseudo)?$")>;

// Copy sign
def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;

//===----------------------------------------------------------------------===//
// FP: Load instructions
//===----------------------------------------------------------------------===//

def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
def : InstRW<[LSU], (instregex "LX$")>;

//===----------------------------------------------------------------------===//
// FP: Store instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;

//===----------------------------------------------------------------------===//
// FP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;

// Load lengthened
def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
def : InstRW<[VecBF], (instregex "LDEBR$")>;
// NOTE(review): stray space before the comma after Lat12 is in the original;
// kept verbatim (whitespace-insignificant to TableGen).
def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;

// Convert from fixed / logical
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;

// Convert to fixed / logical
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;

//===----------------------------------------------------------------------===//
// FP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load Complement / Negative / Positive
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;

// Square root
def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;

// Load FP integer
def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;

//===----------------------------------------------------------------------===//
// FP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;

// Subtraction
def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;

// Multiply
def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;

// Multiply and add / subtract
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;

// Division
def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;

// Divide to integer
def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;

//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//

// Compare
def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;

// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;

//===----------------------------------------------------------------------===//
// FP: Floating-point control register instructions
//===----------------------------------------------------------------------===//

def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;


// --------------------- Hexadecimal floating point ------------------------- //

//===----------------------------------------------------------------------===//
// HFP: Move instructions
//===----------------------------------------------------------------------===//

// Load and Test
def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
def : InstRW<[VecBF], (instregex "LEXR$")>;
def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;

// Load lengthened
def : InstRW<[LSU], (instregex "LDE$")>;
def : InstRW<[FXb], (instregex "LDER$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;

// Convert from fixed
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;

// Convert to fixed
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;

// Convert BFP to HFP / HFP to BFP.
def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;

//===----------------------------------------------------------------------===//
// HFP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load Complement / Negative / Positive
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;

// Halve
def : InstRW<[VecBF], (instregex "H(E|D)R$")>;

// Square root
def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;

// Load FP integer
def : InstRW<[VecBF], (instregex "FIER$")>;
def : InstRW<[VecBF], (instregex "FIDR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;

// Subtraction
def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;

// Multiply
def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;

// Multiply and add / subtract
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;

// Division
def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;

//===----------------------------------------------------------------------===//
// HFP: Comparisons
//===----------------------------------------------------------------------===//

// Compare
def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;


// ------------------------ Decimal floating point -------------------------- //

//===----------------------------------------------------------------------===//
// DFP: Move instructions
//===----------------------------------------------------------------------===//

// Load and Test
def : InstRW<[VecDF], (instregex "LTDTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;

//===----------------------------------------------------------------------===//
// DFP: Conversion instructions
//===----------------------------------------------------------------------===//

// Load rounded
def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;

// Load lengthened
def : InstRW<[VecDF], (instregex "LDETR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;

// Convert from fixed / logical
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;

// Convert to fixed / logical
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;

// Convert from / to signed / unsigned packed
def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;

// Convert from / to zoned
def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;

// Convert from / to packed
def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;

// Perform floating-point operation
def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;

//===----------------------------------------------------------------------===//
// DFP: Unary arithmetic
//===----------------------------------------------------------------------===//

// Load FP integer
def : InstRW<[VecDF], (instregex "FIDTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;

// Extract biased exponent
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;

// Extract significance
def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;

//===----------------------------------------------------------------------===//
// DFP: Binary arithmetic
//===----------------------------------------------------------------------===//

// Addition
def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;

// Subtraction
def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;

// Multiply
def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;

// Division
def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;

// Quantize
def : InstRW<[VecDF], (instregex "QADTR$")>;
def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;

// Reround
def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;

// Shift significand left/right
def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
// NOTE(review): the def below is truncated at the chunk boundary; its
// (instregex "...") operand and terminator lie outside this view.
def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex
"S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecDF], (instregex "(K|C)DTR$")>; +def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[VecDF], (instregex "CEDTR$")>; +def : InstRW<[VecDF], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "VLR(32|64)?$")>; +def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VZERO$")>; +def : InstRW<[VecXsPm], (instregex "VONE$")>; +def : InstRW<[VecXsPm], (instregex "VGBM$")>; +def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads 
+//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>; +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone], + (instregex "VSTM$")>; +def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPERM$")>; +def : InstRW<[VecXsPm], (instregex "VPDI$")>; +def : InstRW<[VecXsPm], (instregex "VBPERM$")>; +def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex 
"VPK(F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[VecXsPm], (instregex "VO(C)?$")>; +def : InstRW<[VecMul], (instregex "VCKSM$")>; +def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VX$")>; +def : InstRW<[VecMul], (instregex "VGFM?$")>; +def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>; +def 
: InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VML(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>; + +def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>; +def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>; + +def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, Lat4], (instregex 
"VEC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[VecBF], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GD$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "VFL(L|R)$")>; +def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>; +def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>; +def : InstRW<[VecBF2], (instregex "WFLLD$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; +def : InstRW<[VecBF2], (instregex "VFISB$")>; +def : InstRW<[VecBF], (instregex "WFISB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; 
+def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFMSB$")>; +def : InstRW<[VecBF], (instregex "WFMSB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[VecFPd], (instregex "VFD$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>; +def : InstRW<[VecFPd], (instregex "WFDXB$")>; +def : InstRW<[VecFPd], (instregex "VFSQ$")>; +def : 
InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>; +def : InstRW<[VecFPd], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LEFR$")>; +def : InstRW<[FXb, Lat4], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecStr], (instregex "VFAE(B)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>; +def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>; 
+def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecDF, VecDF, Lat10, GroupAlone], (instregex "VLIP$")>; +def : InstRW<[VecDFX, LSU, Lat12, GroupAlone], (instregex "VPKZ$")>; +def : InstRW<[VecDFX, FXb, LSU, Lat12, GroupAlone], (instregex "VUPKZ$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>; +def : InstRW<[VecDFX], (instregex "V(A|S)P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>; +def : InstRW<[VecDFX], (instregex "VPSOP$")>; +def : InstRW<[VecDFX], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "IRBM$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], 
(instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + 
+//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index e3e1999d8ad8..4d986e8391cf 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -311,7 +311,7 @@ def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : 
InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>; def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; @@ -337,7 +337,7 @@ def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -403,13 +403,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -436,7 +436,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU, 
Lat2], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -474,7 +475,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; def : InstRW<[FXU], (instregex "C(L)?HHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -499,7 +500,7 @@ def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch @@ -532,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -548,36 +549,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"TR$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : 
InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -621,7 +630,7 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -632,14 +641,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : 
InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -780,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -791,7 +800,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -813,9 +822,9 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -900,16 +909,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], 
(instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -949,16 +962,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, 
DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -967,7 +985,7 @@ def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>; def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -979,7 +997,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex 
"EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1010,15 +1028,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1027,15 +1045,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1046,19 +1064,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; 
def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1103,16 +1122,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], 
(instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1124,7 +1144,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1161,9 +1181,9 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; -def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; @@ -1176,7 +1196,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 59f37205f412..a0f2115eb9d7 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ 
b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -69,7 +69,7 @@ def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -251,7 +251,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], @@ -413,13 +413,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -446,7 +446,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; 
-def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -544,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -560,36 +561,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex 
"(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; 
//===----------------------------------------------------------------------===// @@ -659,7 +668,7 @@ def : InstRW<[FXU], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -670,14 +679,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -818,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -829,7 +838,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : 
InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -851,10 +860,10 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; -def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>; // --------------------- Hexadecimal floating point ------------------------- // @@ -938,16 +947,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, 
FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -987,16 +1000,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, 
DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -1007,11 +1025,11 @@ def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$") // Convert from / to zoned def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>; def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>; -def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; +def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1023,7 +1041,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1054,15 +1072,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; 
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1071,15 +1089,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1090,19 +1108,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, 
EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1148,16 +1167,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1169,7 +1189,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack 
Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1206,7 +1226,7 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; @@ -1221,7 +1241,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 7391df8342ef..13ceb371a425 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; + case SystemZ::WFASB: + Changed |= shortenOn001AddCC(MI, SystemZ::AEBR); + break; + case SystemZ::WFDDB: Changed |= shortenOn001(MI, SystemZ::DDBR); break; + case SystemZ::WFDSB: + Changed |= shortenOn001(MI, SystemZ::DEBR); + break; + case SystemZ::WFIDB: Changed |= shortenFPConv(MI, SystemZ::FIDBRA); break; + case SystemZ::WFISB: + Changed |= 
shortenFPConv(MI, SystemZ::FIEBRA); + break; + case SystemZ::WLDEB: Changed |= shortenOn01(MI, SystemZ::LDEBR); break; @@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MDBR); break; + case SystemZ::WFMSB: + Changed |= shortenOn001(MI, SystemZ::MEEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; + case SystemZ::WFLCSB: + Changed |= shortenOn01(MI, SystemZ::LCDFR_32); + break; + case SystemZ::WFLNDB: Changed |= shortenOn01(MI, SystemZ::LNDFR); break; + case SystemZ::WFLNSB: + Changed |= shortenOn01(MI, SystemZ::LNDFR_32); + break; + case SystemZ::WFLPDB: Changed |= shortenOn01(MI, SystemZ::LPDFR); break; + case SystemZ::WFLPSB: + Changed |= shortenOn01(MI, SystemZ::LPDFR_32); + break; + case SystemZ::WFSQDB: Changed |= shortenOn01(MI, SystemZ::SQDBR); break; + case SystemZ::WFSQSB: + Changed |= shortenOn01(MI, SystemZ::SQEBR); + break; + case SystemZ::WFSDB: Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; + case SystemZ::WFSSB: + Changed |= shortenOn001AddCC(MI, SystemZ::SEBR); + break; + case SystemZ::WFCDB: Changed |= shortenOn01(MI, SystemZ::CDBR); break; + case SystemZ::WFCSB: + Changed |= shortenOn01(MI, SystemZ::CEBR); + break; + case SystemZ::VL32: // For z13 we prefer LDE over LE to avoid partial register dependencies. 
Changed |= shortenOn0(MI, SystemZ::LDE32); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb4a0962f7eb..9cd09b0f911e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -47,6 +47,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), + HasMiscellaneousExtensions2(false), HasGuardedStorage(false), + HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), + HasVectorEnhancements1(false), HasVectorPackedDecimal(false), + HasInsertReferenceBitsMultiple(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index b05a1bb6cafd..4829f73e080e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,6 +56,13 @@ protected: bool HasLoadAndZeroRightmostByte; bool HasMessageSecurityAssist5; bool HasDFPPackedConversion; + bool HasMiscellaneousExtensions2; + bool HasGuardedStorage; + bool HasMessageSecurityAssist7; + bool HasMessageSecurityAssist8; + bool HasVectorEnhancements1; + bool HasVectorPackedDecimal; + bool HasInsertReferenceBitsMultiple; private: Triple TargetTriple; @@ -168,6 +175,33 @@ public: // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } + // Return true if the target has the miscellaneous-extensions facility 2. + bool hasMiscellaneousExtensions2() const { + return HasMiscellaneousExtensions2; + } + + // Return true if the target has the guarded-storage facility. + bool hasGuardedStorage() const { return HasGuardedStorage; } + + // Return true if the target has the message-security-assist + // extension facility 7. 
+ bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; } + + // Return true if the target has the message-security-assist + // extension facility 8. + bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; } + + // Return true if the target has the vector-enhancements facility 1. + bool hasVectorEnhancements1() const { return HasVectorEnhancements1; } + + // Return true if the target has the vector-packed-decimal facility. + bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; } + + // Return true if the target has the insert-reference-bits-multiple facility. + bool hasInsertReferenceBitsMultiple() const { + return HasInsertReferenceBitsMultiple; + } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index cb81c0e5276e..025bf73d2df0 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -143,8 +143,10 @@ public: } // end anonymous namespace void SystemZPassConfig::addIRPasses() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { addPass(createSystemZTDCPass()); + addPass(createLoopDataPrefetchPass()); + } TargetPassConfig::addIRPasses(); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 9ac768b2189d..506dc7427993 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -372,6 +372,9 @@ int SystemZTTIImpl::getArithmeticInstrCost( Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { switch (ScalarBits) { case 32: { + // The vector enhancements facility 1 provides v4f32 instructions. 
+ if (ST->hasVectorEnhancements1()) + return NumVectors; // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6923fc6fc910..a0c6fa94f8c1 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,6 +56,10 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; + unsigned getCacheLineSize() { return 256; } + unsigned getPrefetchDistance() { return 2000; } + unsigned getMinPrefetchStride() { return 2048; } + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149b..6e08d4cff6ea 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -37,6 +37,7 @@ endif() set(sources X86AsmPrinter.cpp X86CallFrameOptimization.cpp + X86CmovConversion.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 19c93cfff0fe..91201d1fec85 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -83,6 +83,9 @@ FunctionPass *createX86WinEHStatePass(); /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); +/// This pass converts X86 cmov instructions into branch when profitable. 
+FunctionPass *createX86CmovConverterPass(); + /// Return a Machine IR pass that selectively replaces /// certain byte and word instructions by equivalent 32 bit instructions, /// in order to eliminate partial register usage, false dependences on diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 4ca57fe9fb00..54eabeac5126 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -814,10 +814,8 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; -// TODO: The scheduler model falls to BTVER2 model. -// The znver1 model has to be put in place. -// Zen -def: ProcessorModel<"znver1", BtVer2Model, [ +// Znver1 +def: ProcessorModel<"znver1", Znver1Model, [ FeatureADX, FeatureAES, FeatureAVX2, diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6decb550ad5f..26461986427d 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -448,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::Swift", CCDelegateTo>, // Handle explicit CC selection - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, + CCIfCC<"CallingConv::Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, // Handle Vectorcall CC @@ -1004,7 +1004,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo>, - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, + CCIfCC<"CallingConv::Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::HHVM", CCDelegateTo>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp new file mode 100644 index 000000000000..bfc834435de5 --- /dev/null +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -0,0 +1,611 @@ +//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This 
file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a pass that converts X86 cmov instructions into branch +/// when profitable. This pass is conservative, i.e., it applies transformation +/// if and only if it can gaurantee a gain with high confidence. +/// +/// Thus, the optimization applies under the following conditions: +/// 1. Consider as a candidate only CMOV in most inner loop, assuming that +/// most hotspots are represented by these loops. +/// 2. Given a group of CMOV instructions, that are using same EFLAGS def +/// instruction: +/// a. Consider them as candidates only if all have same code condition or +/// opposite one, to prevent generating more than one conditional jump +/// per EFLAGS def instruction. +/// b. Consider them as candidates only if all are profitable to be +/// converted, assuming that one bad conversion may casue a degradation. +/// 3. Apply conversion only for loop that are found profitable and only for +/// CMOV candidates that were found profitable. +/// a. Loop is considered profitable only if conversion will reduce its +/// depth cost by some thrishold. +/// b. CMOV is considered profitable if the cost of its condition is higher +/// than the average cost of its true-value and false-value by 25% of +/// branch-misprediction-penalty, this to assure no degredassion even +/// with 25% branch misprediction. +/// +/// Note: This pass is assumed to run on SSA machine code. 
+//===----------------------------------------------------------------------===// +// +// External interfaces: +// FunctionPass *llvm::createX86CmovConverterPass(); +// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF); +// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-cmov-converter" + +STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups"); +STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates"); +STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops"); +STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups"); + +namespace { +// This internal switch can be used to turn off the cmov/branch optimization. +static cl::opt + EnableCmovConverter("x86-cmov-converter", + cl::desc("Enable the X86 cmov-to-branch optimization."), + cl::init(true), cl::Hidden); + +/// Converts X86 cmov instructions into branches when profitable. +class X86CmovConverterPass : public MachineFunctionPass { +public: + X86CmovConverterPass() : MachineFunctionPass(ID) {} + ~X86CmovConverterPass() {} + + StringRef getPassName() const override { return "X86 cmov Conversion"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + /// Pass identification, replacement for typeid. + static char ID; + + const MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + TargetSchedModel TSchedModel; + + /// List of consecutive CMOV instructions. 
+ typedef SmallVector CmovGroup; + typedef SmallVector CmovGroups; + + /// Collect all CMOV-group-candidates in \p CurrLoop and update \p + /// CmovInstGroups accordingly. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff it found any CMOV-group-candidate. + bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups); + + /// Check if it is profitable to transform each CMOV-group-candidates into + /// branch. Remove all groups that are not profitable from \p CmovInstGroups. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff any CMOV-group-candidate remain. + bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups); + + /// Convert the given list of consecutive CMOV instructions into a branch. + /// + /// \param Group Consecutive CMOV instructions to be converted into branch. + void convertCmovInstsToBranches(SmallVectorImpl &Group) const; +}; + +char X86CmovConverterPass::ID = 0; + +void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); +} + +bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + if (!EnableCmovConverter) + return false; + + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << "**********\n"); + + bool Changed = false; + MachineLoopInfo &MLI = getAnalysis(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = STI.getInstrInfo(); + TSchedModel.init(STI.getSchedModel(), &STI, TII); + + //===--------------------------------------------------------------------===// + // Algorithm + // --------- + // For each inner most loop + // collectCmovCandidates() { + // Find all CMOV-group-candidates. 
+ // } + // + // checkForProfitableCmovCandidates() { + // * Calculate both loop-depth and optimized-loop-depth. + // * Use these depth to check for loop transformation profitability. + // * Check for CMOV-group-candidate transformation profitability. + // } + // + // For each profitable CMOV-group-candidate + // convertCmovInstsToBranches() { + // * Create FalseBB, SinkBB, Conditional branch to SinkBB. + // * Replace each CMOV instruction with a PHI instruction in SinkBB. + // } + // + // Note: For more details, see each function description. + //===--------------------------------------------------------------------===// + for (MachineBasicBlock &MBB : MF) { + MachineLoop *CurrLoop = MLI.getLoopFor(&MBB); + + // Optimize only inner most loops. + if (!CurrLoop || CurrLoop->getHeader() != &MBB || + !CurrLoop->getSubLoops().empty()) + continue; + + // List of consecutive CMOV instructions to be processed. + CmovGroups CmovInstGroups; + + if (!collectCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + Changed = true; + for (auto &Group : CmovInstGroups) + convertCmovInstsToBranches(Group); + } + return Changed; +} + +bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups) { + //===--------------------------------------------------------------------===// + // Collect all CMOV-group-candidates and add them into CmovInstGroups. + // + // CMOV-group: + // CMOV instructions, in same MBB, that uses same EFLAGS def instruction. + // + // CMOV-group-candidate: + // CMOV-group where all the CMOV instructions are + // 1. consecutive. + // 2. have same condition code or opposite one. + // 3. have only operand registers (X86::CMOVrr). 
+ //===--------------------------------------------------------------------===// + // List of possible improvement (TODO's): + // -------------------------------------- + // TODO: Add support for X86::CMOVrm instructions. + // TODO: Add support for X86::SETcc instructions. + // TODO: Add support for CMOV-groups with non consecutive CMOV instructions. + //===--------------------------------------------------------------------===// + + // Current processed CMOV-Group. + CmovGroup Group; + for (auto *MBB : CurrLoop->getBlocks()) { + Group.clear(); + // Condition code of first CMOV instruction current processed range and its + // opposite condition code. + X86::CondCode FirstCC, FirstOppCC; + // Indicator of a non CMOVrr instruction in the current processed range. + bool FoundNonCMOVInst = false; + // Indicator for current processed CMOV-group if it should be skipped. + bool SkipGroup = false; + + for (auto &I : *MBB) { + X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + // Check if we found a X86::CMOVrr instruction. + if (CC != X86::COND_INVALID && !I.mayLoad()) { + if (Group.empty()) { + // We found first CMOV in the range, reset flags. + FirstCC = CC; + FirstOppCC = X86::GetOppositeBranchCondition(CC); + FoundNonCMOVInst = false; + SkipGroup = false; + } + Group.push_back(&I); + // Check if it is a non-consecutive CMOV instruction or it has different + // condition code than FirstCC or FirstOppCC. + if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC)) + // Mark the SKipGroup indicator to skip current processed CMOV-Group. + SkipGroup = true; + continue; + } + // If Group is empty, keep looking for first CMOV in the range. + if (Group.empty()) + continue; + + // We found a non X86::CMOVrr instruction. + FoundNonCMOVInst = true; + // Check if this instruction define EFLAGS, to determine end of processed + // range, as there would be no more instructions using current EFLAGS def. 
+ if (I.definesRegister(X86::EFLAGS)) { + // Check if current processed CMOV-group should not be skipped and add + // it as a CMOV-group-candidate. + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + Group.clear(); + } + } + // End of basic block is considered end of range, check if current processed + // CMOV-group should not be skipped and add it as a CMOV-group-candidate. + if (Group.empty()) + continue; + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + } + + NumOfCmovGroupCandidate += CmovInstGroups.size(); + return !CmovInstGroups.empty(); +} + +/// \returns Depth of CMOV instruction as if it was converted into branch. +/// \param TrueOpDepth depth cost of CMOV true value operand. +/// \param FalseOpDepth depth cost of CMOV false value operand. +static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) { + //===--------------------------------------------------------------------===// + // With no info about branch weight, we assume 50% for each value operand. + // Thus, depth of optimized CMOV instruction is the rounded up average of + // its True-Operand-Value-Depth and False-Operand-Value-Depth. + //===--------------------------------------------------------------------===// + return (TrueOpDepth + FalseOpDepth + 1) / 2; +} + +bool X86CmovConverterPass::checkForProfitableCmovCandidates( + MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) { + struct DepthInfo { + /// Depth of original loop. + unsigned Depth; + /// Depth of optimized loop. + unsigned OptDepth; + }; + /// Number of loop iterations to calculate depth for ?! + static const unsigned LoopIterations = 2; + DenseMap DepthMap; + DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}}; + enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 }; + /// For each register type maps the register to its last def instruction. 
+ DenseMap RegDefMaps[RegTypeNum]; + /// Maps register operand to its def instruction, which can be nullptr if it + /// is unknown (e.g., operand is defined outside the loop). + DenseMap OperandToDefMap; + + // Set depth of unknown instruction (i.e., nullptr) to zero. + DepthMap[nullptr] = {0, 0}; + + SmallPtrSet CmovInstructions; + for (auto &Group : CmovInstGroups) + CmovInstructions.insert(Group.begin(), Group.end()); + + //===--------------------------------------------------------------------===// + // Step 1: Calculate instruction depth and loop depth. + // Optimized-Loop: + // loop with CMOV-group-candidates converted into branches. + // + // Instruction-Depth: + // instruction latency + max operand depth. + // * For CMOV instruction in optimized loop the depth is calculated as: + // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth) + // TODO: Find a better way to estimate the latency of the branch instruction + // rather than using the CMOV latency. + // + // Loop-Depth: + // max instruction depth of all instructions in the loop. + // Note: instruction with max depth represents the critical-path in the loop. + // + // Loop-Depth[i]: + // Loop-Depth calculated for first `i` iterations. + // Note: it is enough to calculate depth for up to two iterations. + // + // Depth-Diff[i]: + // Number of cycles saved in first 'i` iterations by optimizing the loop. + //===--------------------------------------------------------------------===// + for (unsigned I = 0; I < LoopIterations; ++I) { + DepthInfo &MaxDepth = LoopDepth[I]; + for (auto *MBB : CurrLoop->getBlocks()) { + // Clear physical registers Def map. + RegDefMaps[PhyRegType].clear(); + for (MachineInstr &MI : *MBB) { + unsigned MIDepth = 0; + unsigned MIDepthOpt = 0; + bool IsCMOV = CmovInstructions.count(&MI); + for (auto &MO : MI.uses()) { + // Checks for "isUse()" as "uses()" returns also implicit definitions. 
+ if (!MO.isReg() || !MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + if (MachineInstr *DefMI = RDM.lookup(Reg)) { + OperandToDefMap[&MO] = DefMI; + DepthInfo Info = DepthMap.lookup(DefMI); + MIDepth = std::max(MIDepth, Info.Depth); + if (!IsCMOV) + MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth); + } + } + + if (IsCMOV) + MIDepthOpt = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth, + DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth); + + // Iterates over all operands to handle implicit definitions as well. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + } + + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency}; + MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth); + MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt); + } + } + } + + unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth, + LoopDepth[1].Depth - LoopDepth[1].OptDepth}; + + //===--------------------------------------------------------------------===// + // Step 2: Check if Loop worth to be optimized. + // Worth-Optimize-Loop: + // case 1: Diff[1] == Diff[0] + // Critical-path is iteration independent - there is no dependency + // of critical-path instructions on critical-path instructions of + // previous iteration. + // Thus, it is enough to check gain percent of 1st iteration - + // To be conservative, the optimized loop need to have a depth of + // 12.5% cycles less than original loop, per iteration. + // + // case 2: Diff[1] > Diff[0] + // Critical-path is iteration dependent - there is dependency of + // critical-path instructions on critical-path instructions of + // previous iteration. 
+ // Thus, it is required to check the gradient of the gain - the + // change in Depth-Diff compared to the change in Loop-Depth between + // 1st and 2nd iterations. + // To be conservative, the gradient need to be at least 50%. + // + // If loop is not worth optimizing, remove all CMOV-group-candidates. + //===--------------------------------------------------------------------===// + bool WorthOptLoop = false; + if (Diff[1] == Diff[0]) + WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; + else if (Diff[1] > Diff[0]) + WorthOptLoop = + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + + if (!WorthOptLoop) + return false; + + ++NumOfLoopCandidate; + + //===--------------------------------------------------------------------===// + // Step 3: Check for each CMOV-group-candidate if it worth to be optimized. + // Worth-Optimize-Group: + // Iff it worths to optimize all CMOV instructions in the group. + // + // Worth-Optimize-CMOV: + // Predicted branch is faster than CMOV by the difference between depth of + // condition operand and depth of taken (predicted) value operand. + // To be conservative, the gain of such CMOV transformation should cover at + // at least 25% of branch-misprediction-penalty. + //===--------------------------------------------------------------------===// + unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; + CmovGroups TempGroups; + std::swap(TempGroups, CmovInstGroups); + for (auto &Group : TempGroups) { + bool WorthOpGroup = true; + for (auto *MI : Group) { + // Avoid CMOV instruction which value is used as a pointer to load from. + // This is another conservative check to avoid converting CMOV instruction + // used with tree-search like algorithm, where the branch is unpredicted. 
+ auto UIs = MRI->use_instructions(MI->defs().begin()->getReg()); + if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) { + unsigned Op = UIs.begin()->getOpcode(); + if (Op == X86::MOV64rm || Op == X86::MOV32rm) { + WorthOpGroup = false; + break; + } + } + + unsigned CondCost = + DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + unsigned ValCost = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, + DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); + if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) { + WorthOpGroup = false; + break; + } + } + + if (WorthOpGroup) + CmovInstGroups.push_back(Group); + } + + return !CmovInstGroups.empty(); +} + +static bool checkEFLAGSLive(MachineInstr *MI) { + if (MI->killsRegister(X86::EFLAGS)) + return false; + + // The EFLAGS operand of MI might be missing a kill marker. + // Figure out whether EFLAGS operand should LIVE after MI instruction. + MachineBasicBlock *BB = MI->getParent(); + MachineBasicBlock::iterator ItrMI = MI; + + // Scan forward through BB for a use/def of EFLAGS. + for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) { + if (I->readsRegister(X86::EFLAGS)) + return true; + if (I->definesRegister(X86::EFLAGS)) + return false; + } + + // We hit the end of the block, check whether EFLAGS is live into a successor. + for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { + if ((*I)->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + +void X86CmovConverterPass::convertCmovInstsToBranches( + SmallVectorImpl &Group) const { + assert(!Group.empty() && "No CMOV instructions to convert"); + ++NumOfOptimizedCmovGroups; + + // To convert a CMOVcc instruction, we actually have to insert the diamond + // control-flow pattern. The incoming instruction knows the destination vreg + // to set, the condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. 
+ + // Before + // ----- + // MBB: + // cond = cmp ... + // v1 = CMOVge t1, f1, cond + // v2 = CMOVlt t2, f2, cond + // v3 = CMOVge v1, f3, cond + // + // After + // ----- + // MBB: + // cond = cmp ... + // jge %SinkMBB + // + // FalseMBB: + // jmp %SinkMBB + // + // SinkMBB: + // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] + // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch + // ; true-value with false-value + // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use + // ; previous Phi instruction result + + MachineInstr &MI = *Group.front(); + MachineInstr *LastCMOV = Group.back(); + DebugLoc DL = MI.getDebugLoc(); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction::iterator It = ++MBB->getIterator(); + MachineFunction *F = MBB->getParent(); + const BasicBlock *BB = MBB->getBasicBlock(); + + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB); + F->insert(It, FalseMBB); + F->insert(It, SinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + if (checkEFLAGSLive(LastCMOV)) { + FalseMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to SinkMBB. + SinkMBB->splice(SinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Add the false and sink blocks as its successors. + MBB->addSuccessor(FalseMBB); + MBB->addSuccessor(SinkMBB); + + // Create the conditional branch instruction. + BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + + // Add the sink block to the false block successors. 
+ FalseMBB->addSuccessor(SinkMBB); + + MachineInstrBuilder MIB; + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + DenseMap> RegRewriteTable; + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are processing is the opposite condition from the jump we + // generated, then we have to swap the operands for the PHI that is going to + // be generated. + if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + std::swap(Op1Reg, Op2Reg); + + auto Op1Itr = RegRewriteTable.find(Op1Reg); + if (Op1Itr != RegRewriteTable.end()) + Op1Reg = Op1Itr->second.first; + + auto Op2Itr = RegRewriteTable.find(Op2Reg); + if (Op2Itr != RegRewriteTable.end()) + Op2Reg = Op2Itr->second.second; + + // SinkMBB: + // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ] + // ... + MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(MBB); + (void)MIB; + DEBUG(dbgs() << "\tFrom: "; MIIt->dump()); + DEBUG(dbgs() << "\tTo: "; MIB->dump()); + + // Add this PHI to the rewrite table. 
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // Now remove the CMOV(s). + MBB->erase(MIItBegin, MIItEnd); +} + +} // End anonymous namespace. + +FunctionPass *llvm::createX86CmovConverterPass() { + return new X86CmovConverterPass(); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ee9e78146305..527e5d568ac6 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1187,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && CC != CallingConv::X86_64_SysV && - CC != CallingConv::X86_64_Win64) + CC != CallingConv::Win64) return false; // Don't handle popping bytes if they don't fit the ret's immediate. @@ -3171,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: break; } diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index c28746f96439..95c6f2a3fa34 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -22,7 +22,7 @@ /// instructions and register-to-register moves. It would /// seem like cmov(s) would also be affected, but because of the way cmov is /// really implemented by most machines as reading both the destination and -/// and source regsters, and then "merging" the two based on a condition, +/// and source registers, and then "merging" the two based on a condition, /// it really already should be considered as having a true dependence on the /// destination register as well. 
/// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 65486cf7f529..44eecd664714 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Custom); } + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. @@ -1663,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). 
setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -2661,7 +2669,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -20188,7 +20196,10 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); @@ -20210,7 +20221,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -20235,7 +20249,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. 
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -20254,7 +20271,10 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -22665,10 +22685,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. + return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. 
@@ -22683,7 +22724,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -24030,7 +24071,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cc5c09cbf0e5..705d0f7a5cf7 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1759,29 +1759,29 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rr) _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rm) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrk) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT 
_.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, _.RC:$src1, addr:$src2), NewInf.KRC)>; } @@ -1798,7 +1798,7 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rmb) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -1879,7 +1879,7 @@ defm : avx512_icmp_packed_rmb_lowering; -defm : avx512_icmp_packed_rmb_lowering; defm : avx512_icmp_packed_rmb_lowering; @@ -2127,17 +2127,17 @@ multiclass avx512_icmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2145,37 +2145,37 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, + _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))), (i64 0)), - 
(COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc), NewInf.KRC)>; } } - + multiclass avx512_icmp_cc_packed_rmb_lowering Preds> + list Preds> : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), @@ -2187,7 +2187,7 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -2447,17 +2447,17 @@ multiclass avx512_fcmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.KVT (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2477,19 +2477,19 @@ let Predicates = Preds in { NewInf.KRC)>; } } - + multiclass avx512_fcmp_cc_packed_sae_lowering Preds> + string InstrStr, list Preds> : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { let Predicates = Preds in def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc, (i32 FROUND_NO_EXC))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; @@ -2817,16 +2817,16 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; 
- def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; } @@ -3036,7 +3036,7 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrr) @@ -3044,8 +3044,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrk) @@ -3063,7 +3063,7 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrri) @@ -3072,8 +3072,8 @@ def : Pat<(insert_subvector (v16i1 
immAllZerosV), imm:$cc), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrik) @@ -3379,35 +3379,35 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, "VMOVDQA32">, + HasAVX512, "VMOVDQA32">, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, "VMOVDQA64">, + HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16">, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512, "VMOVDQU32">, + HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512, "VMOVDQU64">, + HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. 
We need @@ -3964,49 +3964,49 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in { - def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrr">; let Constraints = "$src0 = $dst" in - def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrk">; - - def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrkz">; - def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrr">; let Constraints = "$src0 = $dst" in - def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, 
FoldGenData<"VMOVSDZrrk">; + VEX_W, FoldGenData<"VMOVSDZrrk">; - def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.KRCWM:$mask, VR128X:$src1, + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrrkz">; } @@ -5676,6 +5676,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG 
(v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), 
VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 7e4cba1c8345..343da2573b55 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -224,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; @@ -334,7 +334,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; @@ -450,7 +450,7 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return 
CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index a12fa68faf4f..d831a7974359 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -663,5 +663,6 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index ed53893b779c..9dcc968a1a7a 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -370,6 +370,22 @@ def : WriteRes { let Latency = 100; } def : WriteRes; def : WriteRes; +//////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteEXTRQ: SchedWriteRes<[JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>; + +def WriteINSERTQ: SchedWriteRes<[JFPU01]> { + let Latency = 2; + let ResourceCycles = [4]; +} +def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; + //////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 000000000000..d5b4cfe2ddee --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
+ +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair { + // Register variant takes 1-cycle on Execution Port. 
+ def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. + def : WriteRes { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 8; } + +def : WriteRes; +def : WriteRes; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; + +// IDIV +def : WriteRes { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes{ + let Latency = 4; +} +def : WriteRes { + let Latency = 4; +} + +def : WriteRes { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector Shift Operations +defm : ZnWriteResFpuPair; + +// AES Instructions. 
+defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +def : WriteRes; +def : WriteRes; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : ZnWriteResFpuPair; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fa0afe29586b..427a0001bef9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -597,7 +597,7 @@ public: case CallingConv::Intel_OCL_BI: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return true; // This convention allows using the SysV convention on Windows targets. case CallingConv::X86_64_SysV: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8d891c983fab..08c2cdaefe71 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -375,6 +375,7 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); + addPass(createX86CmovConverterPass()); return true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index aaa6d58bd134..c16207973b39 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,8 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. 
const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt index ad458450fda3..28da36bba209 100644 --- a/lib/ToolDrivers/CMakeLists.txt +++ b/lib/ToolDrivers/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(llvm-dlltool) add_subdirectory(llvm-lib) diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt index 7da9a5c01005..a49e04bdf3c1 100644 --- a/lib/ToolDrivers/LLVMBuild.txt +++ b/lib/ToolDrivers/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = llvm-lib +subdirectories = llvm-dlltool llvm-lib [component_0] type = Group diff --git a/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt new file mode 100644 index 000000000000..52bd5cba86f4 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DllOptionsTableGen) + +add_llvm_library(LLVMDlltoolDriver + DlltoolDriver.cpp + ) + +add_dependencies(LLVMDlltoolDriver DllOptionsTableGen) diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp new file mode 100644 index 000000000000..a7de79306074 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp @@ -0,0 +1,160 @@ +//===- DlltoolDriver.cpp - dlltool.exe-compatible driver ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a dlltool.exe-compatible driver. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/COFFImportFile.h" +#include "llvm/Object/COFFModuleDefinition.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Path.h" + +#include +#include + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::COFF; + +namespace { + +enum { + OPT_INVALID = 0, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +static const llvm::opt::OptTable::Info infoTable[] = { +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \ + {X1, X2, X10, X11, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12}, +#include "Options.inc" +#undef OPTION +}; + +class DllOptTable : public llvm::opt::OptTable { +public: + DllOptTable() : OptTable(infoTable, false) {} +}; + +} // namespace + +std::vector> OwningMBs; + +// Opens a file. Path has to be resolved already. +// Newly created memory buffers are owned by this driver. 
+MemoryBufferRef openFile(StringRef Path) { + ErrorOr> MB = MemoryBuffer::getFile(Path); + + if (std::error_code EC = MB.getError()) + llvm::errs() << "fail openFile: " << EC.message() << "\n"; + + MemoryBufferRef MBRef = MB.get()->getMemBufferRef(); + OwningMBs.push_back(std::move(MB.get())); // take ownership + return MBRef; +} + +static MachineTypes getEmulation(StringRef S) { + return StringSwitch(S) + .Case("i386", IMAGE_FILE_MACHINE_I386) + .Case("i386:x86-64", IMAGE_FILE_MACHINE_AMD64) + .Case("arm", IMAGE_FILE_MACHINE_ARMNT) + .Default(IMAGE_FILE_MACHINE_UNKNOWN); +} + +static std::string getImplibPath(std::string Path) { + SmallString<128> Out = StringRef("lib"); + Out.append(Path); + sys::path::replace_extension(Out, ".a"); + return Out.str(); +} + +int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { + DllOptTable Table; + unsigned MissingIndex; + unsigned MissingCount; + llvm::opt::InputArgList Args = + Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); + if (MissingCount) { + llvm::errs() << Args.getArgString(MissingIndex) << ": missing argument\n"; + return 1; + } + + // Handle when no input or output is specified + if (Args.hasArgNoClaim(OPT_INPUT) || + (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) { + Table.PrintHelp(outs(), ArgsArr[0], "dlltool", false); + llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm\n"; + return 1; + } + + if (!Args.hasArgNoClaim(OPT_m) && Args.hasArgNoClaim(OPT_d)) { + llvm::errs() << "error: no target machine specified\n" + << "supported targets: i386, i386:x86-64, arm\n"; + return 1; + } + + for (auto *Arg : Args.filtered(OPT_UNKNOWN)) + llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; + + MemoryBufferRef MB; + if (auto *Arg = Args.getLastArg(OPT_d)) + MB = openFile(Arg->getValue()); + + if (!MB.getBufferSize()) { + llvm::errs() << "definition file empty\n"; + return 1; + } + + COFF::MachineTypes Machine = IMAGE_FILE_MACHINE_UNKNOWN; + if (auto *Arg = 
Args.getLastArg(OPT_m)) + Machine = getEmulation(Arg->getValue()); + + if (Machine == IMAGE_FILE_MACHINE_UNKNOWN) { + llvm::errs() << "unknown target\n"; + return 1; + } + + Expected Def = + parseCOFFModuleDefinition(MB, Machine, true); + + if (!Def) { + llvm::errs() << "error parsing definition\n" + << errorToErrorCode(Def.takeError()).message(); + return 1; + } + + // Do this after the parser because parseCOFFModuleDefinition sets OutputFile. + if (auto *Arg = Args.getLastArg(OPT_D)) + Def->OutputFile = Arg->getValue(); + + if (Def->OutputFile.empty()) { + llvm::errs() << "no output file specified\n"; + return 1; + } + + std::string Path = Args.getLastArgValue(OPT_l); + if (Path.empty()) + Path = getImplibPath(Def->OutputFile); + + if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine)) + return 1; + return 0; +} diff --git a/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt new file mode 100644 index 000000000000..11736eb47bcb --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DlltoolDriver +parent = Libraries +required_libraries = Object Option Support diff --git a/lib/ToolDrivers/llvm-dlltool/Options.td b/lib/ToolDrivers/llvm-dlltool/Options.td new file mode 100644 index 000000000000..213c6a4d7674 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/Options.td @@ -0,0 +1,26 @@ +include "llvm/Option/OptParser.td" + +def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target machine">; +def m_long : JoinedOrSeparate<["--"], "machine">, Alias; + +def l: JoinedOrSeparate<["-"], "l">, HelpText<"Generate an import lib">; +def l_long : JoinedOrSeparate<["--"], "output-lib">, Alias; + +def D: JoinedOrSeparate<["-"], "D">, HelpText<"Specify the input DLL Name">; +def D_long : JoinedOrSeparate<["--"], "dllname">, Alias; + +def d: JoinedOrSeparate<["-"], "d">, HelpText<"Input .def File">; +def d_long : JoinedOrSeparate<["--"], "input-def">, Alias; + +//============================================================================== +// The flags below do nothing. They are defined only for dlltool compatibility. 
+//============================================================================== + +def k: Flag<["-"], "k">, HelpText<"Kill @n Symbol from export">; +def k_alias: Flag<["--"], "kill-at">, Alias; + +def S: JoinedOrSeparate<["-"], "S">, HelpText<"Assembler">; +def S_alias: JoinedOrSeparate<["--"], "as">, Alias; + +def f: JoinedOrSeparate<["-"], "f">, HelpText<"Assembler Flags">; +def f_alias: JoinedOrSeparate<["--"], "as-flags">, Alias; diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 3d57acf06e74..93eab680ca6b 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2026,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, continue; } + // LLVM's definition of dominance allows instructions that are cyclic + // in unreachable blocks, e.g.: + // %pat = select i1 %condition, @global, i16* %pat + // because any instruction dominates an instruction in a block that's + // not reachable from entry. + // So, remove unreachable blocks from the function, because a) there's + // no point in analyzing them and b) GlobalOpt should otherwise grow + // some more complicated logic to break these cycles. + // Removing unreachable blocks might invalidate the dominator so we + // recalculate it. + if (!F->isDeclaration()) { + if (removeUnreachableBlocks(*F)) { + auto &DT = LookupDomTree(*F); + DT.recalculate(*F); + Changed = true; + } + } + Changed |= processGlobal(*F, TLI, LookupDomTree); if (!F->hasLocalLinkage()) diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 00ddb93df830..317770d133b3 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -909,7 +909,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // To check this we also need to nuke any dead constant uses (perhaps // made dead by this operation on other functions). 
Callee.removeDeadConstantUsers(); - if (Callee.use_empty()) { + if (Callee.use_empty() && !CG.isLibFunction(Callee)) { Calls.erase( std::remove_if(Calls.begin() + i + 1, Calls.end(), [&Callee](const std::pair &Call) { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index ac4765f96075..6baada2c1ae1 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -173,8 +173,10 @@ protected: void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); + template void findEquivalencesFor(BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree); + DominatorTreeBase *DomTree); + void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); @@ -217,7 +219,7 @@ protected: /// \brief Dominance, post-dominance and loop information. std::unique_ptr DT; - std::unique_ptr> PDT; + std::unique_ptr> PDT; std::unique_ptr LI; AssumptionCacheTracker *ACT; @@ -773,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions( /// \param DomTree Opposite dominator tree. If \p Descendants is filled /// with blocks from \p BB1's dominator tree, then /// this is the post-dominator tree, and vice versa. 
+template void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree) { + DominatorTreeBase *DomTree) { const BasicBlock *EC = EquivalenceClass[BB1]; uint64_t Weight = BlockWeights[EC]; for (const auto *BB2 : Descendants) { @@ -1283,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { DT.reset(new DominatorTree); DT->recalculate(F); - PDT.reset(new DominatorTreeBase(true)); + PDT.reset(new PostDomTreeBase()); PDT->recalculate(F); LI.reset(new LoopInfo); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 773c86e23707..fdc9c373b95e 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1284,6 +1284,16 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I, Builder)) return replaceInstUsesWith(I, V); + if (match(Op1, m_One())) { + // (1 << x) & 1 --> zext(x == 0) + // (1 >> x) & 1 --> zext(x == 0) + Value *X; + if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X))))) { + Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0)); + return new ZExtInst(IsZero, I.getType()); + } + } + if (ConstantInt *AndRHS = dyn_cast(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); @@ -1315,23 +1325,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { break; } - case Instruction::Sub: - // -x & 1 -> x & 1 - if (AndRHSMask.isOneValue() && match(Op0LHS, m_Zero())) - return BinaryOperator::CreateAnd(Op0RHS, AndRHS); - - break; - - case Instruction::Shl: - case Instruction::LShr: - // (1 << x) & 1 --> zext(x == 0) - // (1 >> x) & 1 --> zext(x == 0) - if (AndRHSMask.isOneValue() && Op0LHS == AndRHS) { - Value *NewICmp = - Builder.CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType())); - return new ZExtInst(NewICmp, I.getType()); - } - break; } // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if 
C2 fits in the bitwidth @@ -1417,12 +1410,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } - // (A&((~A)|B)) -> A&B - if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op1); - if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op0); - // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) @@ -2020,18 +2007,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *A, *B; - // ((~A & B) | A) -> (A | B) - if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A)))) - return BinaryOperator::CreateOr(A, Op1); - if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateOr(Op0, A); - - // ((A & B) | ~A) -> (~A | B) - // The NOT is guaranteed to be in the RHS by complexity ordering. - if (match(Op1, m_Not(m_Value(A))) && - match(Op0, m_c_And(m_Specific(A), m_Value(B)))) - return BinaryOperator::CreateOr(Op1, B); - // (A & C)|(B & D) Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && @@ -2176,17 +2151,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Not, Op0); } - // (A & B) | (~A ^ B) -> (~A ^ B) - // (A & B) | (B ^ ~A) -> (~A ^ B) - // (B & A) | (~A ^ B) -> (~A ^ B) - // (B & A) | (B ^ ~A) -> (~A ^ B) - // The match order is important: match the xor first because the 'not' - // operation defines 'A'. We do not need to match the xor as Op0 because the - // xor was canonicalized to Op1 above. 
- if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && - match(Op0, m_c_And(m_Specific(A), m_Specific(B)))) - return BinaryOperator::CreateXor(Builder.CreateNot(A), B); - if (SwappedForXor) std::swap(Op0, Op1); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 60d1cde971dd..a8faaecb5c34 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1814,9 +1814,21 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType())); Value *CmpQ = Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType())); - auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And - : Instruction::Or; - return BinaryOperator::Create(LogicOpc, CmpP, CmpQ); + auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, CmpP, CmpQ); + } + + // Are we using xors to bitwise check for a pair of (in)equalities? Convert to + // a shorter form that has more potential to be folded even further. + Value *X1, *X2, *X3, *X4; + if (match(Or->getOperand(0), m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) && + match(Or->getOperand(1), m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) { + // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4) + // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4) + Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2); + Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4); + auto BOpc = Pred == CmpInst::ICMP_EQ ? 
Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, Cmp12, Cmp34); } return nullptr; @@ -3737,6 +3749,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, const APInt &CVal = CI->getValue(); if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) return nullptr; + } else { + // In this case we could have the operand of the binary operation + // being defined in another block, and performing the replacement + // could break the dominance relation. + return nullptr; } } else { // Other uses prohibit this transformation. @@ -3856,18 +3873,17 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, } else if (BinaryOperator *BO = dyn_cast(U)) { assert(BO->getOpcode() == Instruction::And); // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) - Value *ShortMask = - Builder.CreateTrunc(BO->getOperand(1), Builder.getIntNTy(MulWidth)); + ConstantInt *CI = cast(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask); - Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType()); - if (auto *ZextI = dyn_cast(Zext)) - IC.Worklist.Add(ZextI); + Instruction *Zext = + cast(Builder.CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); IC.replaceInstUsesWith(*BO, Zext); } else { llvm_unreachable("Unexpected Binary operation"); } - if (auto *UI = dyn_cast(U)) - IC.Worklist.Add(UI); + IC.Worklist.Add(cast(U)); } } if (isa(OtherVal)) diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index c59e1ce69ac2..451036545741 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -998,8 +998,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // that this code is not reachable. 
We do this instead of inserting // an unreachable instruction directly because we cannot modify the // CFG. - new StoreInst(UndefValue::get(LI.getType()), - Constant::getNullValue(Op->getType()), &LI); + StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()), + Constant::getNullValue(Op->getType()), &LI); + SI->setDebugLoc(LI.getDebugLoc()); return replaceInstUsesWith(LI, UndefValue::get(LI.getType())); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 5689c0604239..a20f474cbf40 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -417,8 +417,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // the highest demanded bit, we just return the other side. if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); - // We can't do this with the LHS for subtraction. - if (I->getOpcode() == Instruction::Add && + // We can't do this with the LHS for subtraction, unless we are only + // demanding the LSB. + if ((I->getOpcode() == Instruction::Add || + DemandedFromOps.isOneValue()) && DemandedFromOps.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 90e232399155..c7766568fd9d 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -636,17 +636,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I)); + // Do "A op C" and "B op C" both simplify? 
- if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - C = Builder.CreateBinOp(InnerOpcode, L, R); - C->takeName(&I); - return C; - } + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + C = Builder.CreateBinOp(InnerOpcode, L, R); + C->takeName(&I); + return C; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "B op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, B, C); + C->takeName(&I); + return C; + } + + // Does "B op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, A, C); + C->takeName(&I); + return C; + } } if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) { @@ -655,17 +673,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + // Do "A op B" and "A op C" both simplify? - if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - A = Builder.CreateBinOp(InnerOpcode, L, R); - A->takeName(&I); - return A; - } + if (L && R) { + // They do! Return "L op' R". 
+ ++NumExpand; + A = Builder.CreateBinOp(InnerOpcode, L, R); + A->takeName(&I); + return A; + } + + // Does "A op B" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "A op C". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, C); + A->takeName(&I); + return A; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op B". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, B); + A->takeName(&I); + return A; + } } // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 184940b7ea58..057f746e052d 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -22,9 +22,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -43,6 +45,7 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" @@ -192,6 +195,11 @@ static cl::opt ClMaxInlinePoisoningSize( static cl::opt ClUseAfterReturn("asan-use-after-return", cl::desc("Check stack-use-after-return"), cl::Hidden, cl::init(true)); +static cl::opt ClRedzoneByvalArgs("asan-redzone-byval-args", + cl::desc("Create redzones for byval " + "arguments 
(extra copy " + "required)"), cl::Hidden, + cl::init(true)); static cl::opt ClUseAfterScope("asan-use-after-scope", cl::desc("Check stack-use-after-scope"), cl::Hidden, cl::init(false)); @@ -747,6 +755,9 @@ struct FunctionStackPoisoner : public InstVisitor { bool runOnFunction() { if (!ClStack) return false; + + if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas(); + // Collect alloca, ret, lifetime instructions etc. for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); @@ -763,6 +774,11 @@ struct FunctionStackPoisoner : public InstVisitor { return true; } + // Arguments marked with the "byval" attribute are implicitly copied without + // using an alloca instruction. To produce redzones for those arguments, we + // copy them a second time into memory allocated with an alloca instruction. + void copyArgsPassedByValToAllocas(); + // Finds all Alloca instructions and puts // poisoned red zones around all of them. // Then unpoison everything back before the function returns. @@ -2528,6 +2544,28 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) { llvm_unreachable("impossible LocalStackSize"); } +void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { + BasicBlock &FirstBB = *F.begin(); + IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt()); + const DataLayout &DL = F.getParent()->getDataLayout(); + for (Argument &Arg : F.args()) { + if (Arg.hasByValAttr()) { + Type *Ty = Arg.getType()->getPointerElementType(); + unsigned Align = Arg.getParamAlignment(); + if (Align == 0) Align = DL.getABITypeAlignment(Ty); + + const std::string &Name = Arg.hasName() ? 
Arg.getName().str() : + "Arg" + llvm::to_string(Arg.getArgNo()); + AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval"); + AI->setAlignment(Align); + Arg.replaceAllUsesWith(AI); + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + IRB.CreateMemCpy(AI, &Arg, AllocSize, Align); + } + } +} + PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, Instruction *ThenTerm, diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 1348e0ed0ed0..b7c6271869cd 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3039,7 +3039,7 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVAStartInst(VAStartInst &I) override { - if (F.getCallingConv() == CallingConv::X86_64_Win64) + if (F.getCallingConv() == CallingConv::Win64) return; IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); @@ -3053,7 +3053,7 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVACopyInst(VACopyInst &I) override { - if (F.getCallingConv() == CallingConv::X86_64_Win64) + if (F.getCallingConv() == CallingConv::Win64) return; IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index e3c36c98ab0d..06fe07598374 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -281,6 +281,16 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTraceSwitchFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy)); + // Make sure smaller parameters are zero-extended to i64 as required by the + // x86_64 ABI. 
+ if (TargetTriple.getArch() == Triple::x86_64) { + for (int i = 0; i < 3; i++) { + SanCovTraceCmpFunction[i]->addParamAttr(0, Attribute::ZExt); + SanCovTraceCmpFunction[i]->addParamAttr(1, Attribute::ZExt); + } + SanCovTraceDivFunction[0]->addParamAttr(0, Attribute::ZExt); + } + // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 7fd77a082b82..c5c9b2c185d6 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -562,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration, if (!MSSA) return false; + // If MemorySSA has determined that one of EarlierInst or LaterInst does not + // read/write memory, then we can safely return true here. + // FIXME: We could be more aggressive when checking doesNotAccessMemory(), + // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass + // by also checking the MemorySSA MemoryAccess on the instruction. Initial + // experiments suggest this isn't worthwhile, at least for C/C++ code compiled + // with the default optimization pipeline. + auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst); + if (!EarlierMA) + return true; + auto *LaterMA = MSSA->getMemoryAccess(LaterInst); + if (!LaterMA) + return true; + // Since we know LaterDef dominates LaterInst and EarlierInst dominates // LaterInst, if LaterDef dominates EarlierInst then it can't occur between // EarlierInst and LaterInst and neither can any other write that potentially // clobbers LaterInst. 
MemoryAccess *LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst); - return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst)); + return MSSA->dominates(LaterDef, EarlierMA); } bool EarlyCSE::processNode(DomTreeNode *Node) { diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0fe72f3f7331..ea28705e684d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1168,6 +1168,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), LI->getSyncScopeID(), UnavailablePred->getTerminator()); + NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. AAMDNodes Tags; diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index a40c22c3fce9..99b4458ea0fa 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -805,6 +805,25 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP ConstantInt *One = ConstantInt::get(IndVarTy, 1); // TODO: generalize the predicates here to also match their unsigned variants. if (IsIncreasing) { + bool DecreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (++i != len) { while (++i < len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SLT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMin(SE, RightSCEV)) { + // while (true) { while (true) { + // if (++i == len) ---> if (++i > len - 1) + // break; break; + // ... ... 
+ // } } + Pred = ICmpInst::ICMP_SGT; + RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); + DecreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); @@ -829,16 +848,41 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateAdd(RightValue, One); + // We need to increase the right value unless we have already decreased + // it virtually when we replaced EQ with SGT. + if (!DecreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateAdd(RightValue, One); + } } else { if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } + assert(!DecreasedRightValueByOne && + "Right value can be decreased only for LatchBrExitIdx == 0!"); } } else { + bool IncreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (--i != len) { while (--i > len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SGT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMax(SE, RightSCEV)) { + // while (true) { while (true) { + // if (--i == len) ---> if (--i < len + 1) + // break; break; + // ... ... 
+ // } } + Pred = ICmpInst::ICMP_SLT; + RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); + IncreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); @@ -863,14 +907,20 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateSub(RightValue, One); + // We need to decrease the right value unless we have already increased + // it virtually when we replaced EQ with SLT. + if (!IncreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateSub(RightValue, One); + } } else { if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by lower limit"; return None; } + assert(!IncreasedRightValueByOne && + "Right value can be increased only for LatchBrExitIdx == 0!"); } } @@ -922,14 +972,18 @@ LoopConstrainer::calculateSubRanges() const { bool Increasing = MainLoopStructure.IndVarIncreasing; - // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the - // range of values the induction variable takes. + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or + // [Smallest, GreatestSeen] is the range of values the induction variable + // takes. - const SCEV *Smallest = nullptr, *Greatest = nullptr; + const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr; + const SCEV *One = SE.getOne(Ty); if (Increasing) { Smallest = Start; Greatest = End; + // No overflow, because the range [Smallest, GreatestSeen] is not empty. + GreatestSeen = SE.getMinusSCEV(End, One); } else { // These two computations may sign-overflow. Here is why that is okay: // @@ -947,9 +1001,9 @@ LoopConstrainer::calculateSubRanges() const { // will be an empty range. 
Returning an empty range is always safe. // - const SCEV *One = SE.getOne(Ty); Smallest = SE.getAddExpr(End, One); Greatest = SE.getAddExpr(Start, One); + GreatestSeen = Start; } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { @@ -964,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const { Result.LowLimit = Clamp(Range.getBegin()); bool ProvablyNoPostLoop = - SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd()); if (!ProvablyNoPostLoop) Result.HighLimit = Clamp(Range.getEnd()); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index ee3de51b1360..4056cc5cb346 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -2168,11 +2168,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { return false; } -/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form +/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the +/// same BB in the form /// bb: /// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... -/// %s = select p, trueval, falseval +/// %s = select %p, trueval, falseval /// +/// or +/// +/// bb: +/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... +/// %c = cmp %p, 0 +/// %s = select %c, trueval, falseval +// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// @@ -2186,44 +2194,54 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { if (LoopHeaders.count(BB)) return false; - // Look for a Phi/Select pair in the same basic block. The Phi feeds the - // condition of the Select and at least one of the incoming values is a - // constant. 
for (BasicBlock::iterator BI = BB->begin(); PHINode *PN = dyn_cast(BI); ++BI) { - unsigned NumPHIValues = PN->getNumIncomingValues(); - if (NumPHIValues == 0 || !PN->hasOneUse()) - continue; - - SelectInst *SI = dyn_cast(PN->user_back()); - if (!SI || SI->getParent() != BB) - continue; - - Value *Cond = SI->getCondition(); - if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1)) + // Look for a Phi having at least one constant incoming value. + if (llvm::all_of(PN->incoming_values(), + [](Value *V) { return !isa(V); })) continue; - bool HasConst = false; - for (unsigned i = 0; i != NumPHIValues; ++i) { - if (PN->getIncomingBlock(i) == BB) + auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) { + // Check if SI is in BB and use V as condition. + if (SI->getParent() != BB) return false; - if (isa(PN->getIncomingValue(i))) - HasConst = true; - } + Value *Cond = SI->getCondition(); + return (Cond && Cond == V && Cond->getType()->isIntegerTy(1)); + }; - if (HasConst) { - // Expand the select. - TerminatorInst *Term = - SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); - PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); - NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); - NewPN->addIncoming(SI->getFalseValue(), BB); - SI->replaceAllUsesWith(NewPN); - SI->eraseFromParent(); - return true; + SelectInst *SI = nullptr; + for (Use &U : PN->uses()) { + if (ICmpInst *Cmp = dyn_cast(U.getUser())) { + // Look for a ICmp in BB that compares PN with a constant and is the + // condition of a Select. + if (Cmp->getParent() == BB && Cmp->hasOneUse() && + isa(Cmp->getOperand(1 - U.getOperandNo()))) + if (SelectInst *SelectI = dyn_cast(Cmp->user_back())) + if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) { + SI = SelectI; + break; + } + } else if (SelectInst *SelectI = dyn_cast(U.getUser())) { + // Look for a Select in BB that uses PN as condtion. 
+ if (isUnfoldCandidate(SelectI, U.get())) { + SI = SelectI; + break; + } + } } + + if (!SI) + continue; + // Expand the select. + TerminatorInst *Term = + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); + NewPN->addIncoming(SI->getFalseValue(), BB); + SI->replaceAllUsesWith(NewPN); + SI->eraseFromParent(); + return true; } - return false; } diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 606136dc31a4..2e0d8e0374c0 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, + OptimizationRemarkEmitter *ORE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} + PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -353,6 +355,8 @@ private: LoopInfo *LI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; bool InnerLoopHasReduction; }; @@ -361,8 +365,9 @@ private: /// loop. 
class LoopInterchangeProfitability { public: - LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, @@ -376,6 +381,8 @@ private: /// Scev analysis. ScalarEvolution *SE; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; }; /// LoopInterchangeTransform interchanges the loop. @@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass { DependenceInfo *DI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; + LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); @@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass { AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); + AU.addRequired(); } bool runOnFunction(Function &F) override { @@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass { DI = &getAnalysis().getDI(); auto *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + ORE = &getAnalysis().getORE(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Build up a worklist of loop pairs to analyze. @@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass { Loop *OuterLoop = LoopList[OuterLoopId]; LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, - PreserveLCSSA); + PreserveLCSSA, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. 
Cannot prove legality\n"); return false; } DEBUG(dbgs() << "Loops are legal to interchange\n"); - LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE); + LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Interchanging loops not profitable\n"); return false; } + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Loop interchanged with enclosing loop."); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); @@ -760,6 +777,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."); return true; } @@ -767,6 +790,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (Inductions.size() != 1) { DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 
<< "Failed to interchange due to current limitation\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiInductionInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with 1 induction variable can be " + "interchanged currently."); return true; } if (Reductions.size() > 0) @@ -777,6 +806,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."); return true; } @@ -785,18 +820,35 @@ bool LoopInterchangeLegality::currentLimitations() { if (!Reductions.empty()) { DEBUG(dbgs() << "Outer loops with reductions are not supported " << "currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "ReductionsOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Outer loops with reductions cannot be interchangeed " + "currently."); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { DEBUG(dbgs() << "Loops with more than 1 induction variables are not " << "supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " + "interchanged currently."); return true; } // TODO: Triangular loops are not handled for now. 
if (!isLoopStructureUnderstood(InnerInductionVar)) { DEBUG(dbgs() << "Loop structure not understood by pass\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedStructureInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Inner loop structure not understood currently."); return true; } @@ -805,12 +857,24 @@ bool LoopInterchangeLegality::currentLimitations() { getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with LCSSA PHIs can be interchange " + "currently."); return true; } LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuterInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with LCSSA PHIs can be interchange " + "currently."); return true; } @@ -835,6 +899,11 @@ bool LoopInterchangeLegality::currentLimitations() { if (!InnerIndexVarInc) { DEBUG(dbgs() << "Did not find an instruction to increment the induction " << "variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIncrementInInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "The inner loop does not increment the induction variable."); return true; } @@ -852,6 +921,12 @@ bool LoopInterchangeLegality::currentLimitations() { if (!I.isIdenticalTo(InnerIndexVarInc)) { DEBUG(dbgs() << "Found unsupported instructions between induction " << "variable increment and branch.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedInsBetweenInduction", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) 
+ << "Found unsupported instruction between induction variable " + "increment and branch."); return true; } @@ -862,6 +937,11 @@ bool LoopInterchangeLegality::currentLimitations() { // current limitation. if (!FoundInduction) { DEBUG(dbgs() << "Did not find the induction variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIndutionVariable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Did not find the induction variable."); return true; } return false; @@ -875,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << " due to dependence\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "Dependence", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops due to dependences."); return false; } @@ -910,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // Check if the loops are tightly nested. if (!tightlyNested(OuterLoop, InnerLoop)) { DEBUG(dbgs() << "Loops not tightly nested\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NotTightlyNested", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops because they are not tightly " + "nested."); return false; } @@ -1005,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. 
- bool ImprovesPar = - isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); - return ImprovesPar; + if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) + return true; + + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops is too costly (cost=" + << ore::NV("Cost", Cost) << ", threshold=" + << ore::NV("Threshold", LoopInterchangeCostThreshold) << + ") and it does not improve parallelism."); + return false; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, @@ -1291,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 9397b87cdf56..90c5c243f464 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -68,6 +68,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); /// If it contains any dynamic allocas, returns false. static bool canTRE(Function &F) { // Because of PR962, we don't TRE dynamic allocas. 
- for (auto &BB : F) { - for (auto &I : BB) { - if (AllocaInst *AI = dyn_cast(&I)) { - if (!AI->isStaticAlloca()) - return false; - } - } - } - - return true; + return llvm::all_of(instructions(F), [](Instruction &I) { + auto *AI = dyn_cast(&I); + return !AI || AI->isStaticAlloca(); + }); } namespace { diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 5170c68e2915..d43ce7abb7cd 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -22,6 +22,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -736,7 +737,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // remainder are connected to the original Loop's exit blocks. The remaining // work is to update the phi nodes in the original loop, and take in the // values from the cloned region. Also update the dominator info for - // OtherExits, since we have new edges into OtherExits. + // OtherExits and their immediate successors, since we have new edges into + // OtherExits. + SmallSet ImmediateSuccessorsOfExitBlocks; for (auto *BB : OtherExits) { for (auto &II : *BB) { @@ -759,12 +762,35 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, cast(VMap[Phi->getIncomingBlock(i)])); } } +#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG) + for (BasicBlock *SuccBB : successors(BB)) { + assert(!(any_of(OtherExits, + [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) || + SuccBB == LatchExit) && + "Breaks the definition of dedicated exits!"); + } +#endif // Update the dominator info because the immediate dominator is no longer the // header of the original Loop. BB has edges both from L and remainder code. 
// Since the preheader determines which loop is run (L or directly jump to // the remainder code), we set the immediate dominator as the preheader. - if (DT) + if (DT) { DT->changeImmediateDominator(BB, PreHeader); + // Also update the IDom for immediate successors of BB. If the current + // IDom is the header, update the IDom to be the preheader because that is + // the nearest common dominator of all predecessors of SuccBB. We need to + // check for IDom being the header because successors of exit blocks can + // have edges from outside the loop, and we should not incorrectly update + // the IDom in that case. + for (BasicBlock *SuccBB: successors(BB)) + if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) { + if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) { + assert(!SuccBB->getSinglePredecessor() && + "BB should be the IDom then!"); + DT->changeImmediateDominator(SuccBB, PreHeader); + } + } + } } // Loop structure should be the following: diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index eb82ee283d44..012b10c8a9b0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -574,11 +574,9 @@ protected: /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(Loop *NewLoop); - /// Emit a bypass check to see if the trip count would overflow, or we - /// wouldn't have enough iterations to execute one vector loop. + /// Emit a bypass check to see if the vector trip count is zero, including if + /// it overflows. void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); - /// Emit a bypass check to see if the vector trip count is nonzero. - void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. 
void emitSCEVChecks(Loop *L, BasicBlock *Bypass); @@ -3289,37 +3287,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - Value *CheckMinIters = Builder.CreateICmpULT( - Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); + // Generate code to check if the loop's trip count is less than VF * UF, or + // equal to it in case a scalar epilogue is required; this implies that the + // vector trip count is zero. This check also covers the case where adding one + // to the backedge-taken count overflowed leading to an incorrect trip count + // of zero. In this case we will also jump to the scalar loop. + auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = - BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); -} - -void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, - BasicBlock *Bypass) { - Value *TC = getOrCreateVectorTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. 
- Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), - "cmp.zero"); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass @@ -3328,7 +3305,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, Cmp)); + BranchInst::Create(Bypass, NewBB, CheckMinIters)); LoopBypassBlocks.push_back(BB); } @@ -3477,14 +3454,13 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. - emitVectorLoopEnteredCheck(Lp, ScalarPH); + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(Lp, ScalarPH); @@ -3527,7 +3503,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() { // We know what the end value is. 
EndValue = CountRoundDown; } else { - IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(CountRoundDown, true, StepType, true); @@ -4168,7 +4144,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // This is the vector-clone of the value that leaves the loop. Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4425043ad39a..dcbcab459a6b 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -434,7 +434,7 @@ private: /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. - Value *alreadyVectorized(ArrayRef VL) const; + Value *alreadyVectorized(ArrayRef VL, Value *OpValue) const; /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -857,7 +857,7 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP); + bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, Value *OpValue); /// Un-bundles a group of instructions. 
void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -1212,7 +1212,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that all of the users of the scalars that we want to vectorize are // schedulable. Instruction *VL0 = cast(VL[0]); - BasicBlock *BB = cast(VL0)->getParent(); + BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with @@ -1237,7 +1237,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this)) { + if (!BS.tryScheduleBundle(VL, this, VL0)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && @@ -2427,8 +2427,8 @@ Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { return Vec; } -Value *BoUpSLP::alreadyVectorized(ArrayRef VL) const { - if (const TreeEntry *En = getTreeEntry(VL[0])) { +Value *BoUpSLP::alreadyVectorized(ArrayRef VL, Value *OpValue) const { + if (const TreeEntry *En = getTreeEntry(OpValue)) { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -2553,7 +2553,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *InVec = vectorizeTree(INVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CastInst *CI = dyn_cast(VL0); @@ -2575,7 +2575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CmpInst::Predicate P0 = cast(VL0)->getPredicate(); @@ -2604,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; Value 
*V = Builder.CreateSelect(Cond, True, False); @@ -2644,7 +2644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; BinaryOperator *BinOp = cast(VL0); @@ -2806,7 +2806,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; // Create a vector of LHS op1 RHS @@ -3097,8 +3097,8 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP) { - if (isa(VL[0])) + BoUpSLP *SLP, Value *OpValue) { + if (isa(OpValue)) return true; // Initialize the instruction bundle. @@ -3106,7 +3106,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. 
@@ -3177,7 +3177,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, } } if (!Bundle->isReady()) { - cancelScheduling(VL, VL[0]); + cancelScheduling(VL, OpValue); return false; } return true; diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index 3e3eff39d637..f475878e2f2a 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -30,7 +30,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules" - "${LLVM_BINARY_DIR}/lib/cmake/llvm" + "${LLVM_LIBRARY_DIR}/cmake/llvm" ) # Some of the runtimes will conditionally use the compiler-rt sanitizers @@ -123,7 +123,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) set(LLVM_RUNTIME_OUTPUT_INTDIR "${LLVM_TOOLS_BINARY_DIR}/${LLVM_RUNTIMES_TARGET}") endif() endif() - + # Between each sub-project we want to cache and clear the LIT properties set_property(GLOBAL PROPERTY LLVM_LIT_TESTSUITES) set_property(GLOBAL PROPERTY LLVM_LIT_PARAMS) @@ -154,7 +154,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) if(LLVM_INCLUDE_TESTS) # Add a global check rule now that all subdirectories have been traversed # and we know the total set of lit testsuites. 
- + add_lit_target(check-runtimes "Running all regression tests" ${RUNTIMES_LIT_TESTSUITES} @@ -331,6 +331,7 @@ else() # if this is included from LLVM's CMake # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} + -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR} -DCMAKE_C_COMPILER_TARGET=${target} -DCMAKE_CXX_COMPILER_TARGET=${target} -DCMAKE_ASM_COMPILER_TARGET=${target} @@ -339,6 +340,7 @@ else() # if this is included from LLVM's CMake -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON ${${target}_extra_args} + TOOLCHAIN_TOOLS clang lld llvm-ar llvm-ranlib PASSTHROUGH_PREFIXES ${prefixes} EXTRA_TARGETS ${${target}_extra_targets} ${${target}_test_targets} @@ -376,6 +378,7 @@ else() # if this is included from LLVM's CMake # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} + -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR} PASSTHROUGH_PREFIXES ${prefixes} EXTRA_TARGETS ${extra_targets} ${test_targets} diff --git a/test/Analysis/CostModel/SystemZ/fp-arith.ll b/test/Analysis/CostModel/SystemZ/fp-arith.ll index 08a7c291138f..5f92db1ababf 100644 --- a/test/Analysis/CostModel/SystemZ/fp-arith.ll +++ b/test/Analysis/CostModel/SystemZ/fp-arith.ll @@ -1,4 +1,7 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z13 %s +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z14 %s ; ; Note: The scalarized vector instructions cost is not including any ; extracts, due to the undef operands @@ -21,13 +24,17 @@ define void @fadd() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fadd float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for 
instruction: %res1 = fadd double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fadd fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fadd <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fadd <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fadd <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fadd <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fadd <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fadd <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fadd <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fadd <16 x double> undef, undef ret void; @@ -49,13 +56,17 @@ define void @fsub() { ; CHECK: Cost Model: Found an estimated cost of 1 for 
instruction: %res0 = fsub float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fsub double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fsub fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fsub <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fsub <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fsub <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fsub <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fsub <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fsub <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fsub <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fsub <16 x double> undef, undef ret void; @@ 
-77,13 +88,17 @@ define void @fmul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fmul float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fmul double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fmul fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fmul <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fmul <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fmul <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fmul <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fmul <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fmul <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fmul <16 x float> undef, undef ; CHECK: Cost Model: Found an 
estimated cost of 8 for instruction: %res10 = fmul <16 x double> undef, undef ret void; @@ -105,13 +120,17 @@ define void @fdiv() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fdiv float undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fdiv double undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fdiv fp128 undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fdiv <2 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fdiv <2 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fdiv <4 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fdiv <4 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fdiv <8 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fdiv <8 x double> undef, undef -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef +; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef +; CHECK-Z14: Cost Model: Found an estimated cost 
of 4 for instruction: %res9 = fdiv <16 x float> undef, undef ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fdiv <16 x double> undef, undef ret void; diff --git a/test/Assembler/diimportedentity.ll b/test/Assembler/diimportedentity.ll index bc85ca09f658..6a0e1eb931c1 100644 --- a/test/Assembler/diimportedentity.ll +++ b/test/Assembler/diimportedentity.ll @@ -18,9 +18,9 @@ ; CHECK: !3 = !DICompositeType({{.*}}) !3 = !DICompositeType(tag: DW_TAG_structure_type, name: "Class", size: 32, align: 32) -; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7) +; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, file: !2, line: 7) !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, - entity: !1, line: 7) + entity: !1, file: !2, line: 7) ; CHECK-NEXT: !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0) !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0) diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll index 31c3fda1b00a..3cf082472829 100644 --- a/test/Bitcode/DIGlobalVariableExpression.ll +++ b/test/Bitcode/DIGlobalVariableExpression.ll @@ -36,4 +36,4 @@ !9 = !{!"clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)"} !10 = distinct !DIGlobalVariable(name: "c", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !DIExpression(DW_OP_constu, 23, DW_OP_stack_value)) !11 = distinct !DIGlobalVariable(name: "h", scope: !1, file: !2, line: 2, type: !5, isLocal: false, isDefinition: true) -!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 1, scope: !1, entity: !11) +!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !2, line: 1, scope: !1, entity: !11) diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll index cf6c30e7c26c..e9313dfba870 100644 --- 
a/test/Bitcode/compatibility-3.6.ll +++ b/test/Bitcode/compatibility-3.6.ll @@ -368,9 +368,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll index 180dad258b68..82fc99055357 100644 --- a/test/Bitcode/compatibility-3.7.ll +++ b/test/Bitcode/compatibility-3.7.ll @@ -368,9 +368,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll index 370c7f51a2b7..2e70a380d10e 100644 --- a/test/Bitcode/compatibility-3.8.ll +++ b/test/Bitcode/compatibility-3.8.ll @@ -393,9 +393,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void 
@f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll index 4115cbd8fe64..7c84daa7d3c4 100644 --- a/test/Bitcode/compatibility-3.9.ll +++ b/test/Bitcode/compatibility-3.9.ll @@ -422,9 +422,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll index eef925564ecb..9e34d48c95f7 100644 --- a/test/Bitcode/compatibility-4.0.ll +++ b/test/Bitcode/compatibility-4.0.ll @@ -422,9 +422,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.x86_64_win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll index ebd727ba9aee..7df1535a6923 100644 --- 
a/test/Bitcode/compatibility.ll +++ b/test/Bitcode/compatibility.ll @@ -425,9 +425,9 @@ declare cc78 void @f.cc78() declare x86_64_sysvcc void @f.x86_64_sysvcc() ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc() declare cc79 void @f.cc79() -; CHECK: declare x86_64_win64cc void @f.cc79() -declare x86_64_win64cc void @f.x86_64_win64cc() -; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc() +; CHECK: declare win64cc void @f.cc79() +declare win64cc void @f.win64cc() +; CHECK: declare win64cc void @f.win64cc() declare cc80 void @f.cc80() ; CHECK: declare x86_vectorcallcc void @f.cc80() declare x86_vectorcallcc void @f.x86_vectorcallcc() diff --git a/test/Bitcode/upgrade-importedentity.ll b/test/Bitcode/upgrade-importedentity.ll new file mode 100644 index 000000000000..134ccf1f3eaf --- /dev/null +++ b/test/Bitcode/upgrade-importedentity.ll @@ -0,0 +1,15 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s +; RUN: verify-uselistorder < %s.bc + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9, !10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)", emissionKind: FullDebug, imports: !3) +!1 = !DIFile(filename: "using.ii", directory: "/") +!3 = !{!4} +!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !8, line: 301) +; CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5) +!5 = !DINamespace(name: "M", scope: null) +!8 = !DINamespace(name: "N", scope: null) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/Bitcode/upgrade-importedentity.ll.bc b/test/Bitcode/upgrade-importedentity.ll.bc new file mode 100644 index 000000000000..7fa833b50462 Binary files /dev/null and b/test/Bitcode/upgrade-importedentity.ll.bc differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b52b6018e026..124f0c72fd75 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,6 +48,7 @@ 
set(LLVM_TEST_DEPENDS llvm-cvtres llvm-diff llvm-dis + llvm-dlltool llvm-dsymutil llvm-dwarfdump llvm-dwp @@ -58,6 +59,7 @@ set(LLVM_TEST_DEPENDS llvm-mc llvm-mcmarkup llvm-modextract + llvm-mt llvm-nm llvm-objdump llvm-opt-report @@ -65,6 +67,7 @@ set(LLVM_TEST_DEPENDS llvm-profdata llvm-ranlib llvm-readobj + llvm-readelf llvm-rtdyld llvm-size llvm-split diff --git a/test/CodeGen/AArch64/GlobalISel/select-fma.mir b/test/CodeGen/AArch64/GlobalISel/select-fma.mir new file mode 100644 index 000000000000..3b2f3746b587 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-fma.mir @@ -0,0 +1,41 @@ +# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @FMADDSrrr_fpr() { ret void } +... + +--- +# CHECK-LABEL: name: FMADDSrrr_fpr +name: FMADDSrrr_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr32, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr32, preferred-register: '' } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = COPY %w2 +# CHECK: %3 = FMADDSrrr %0, %1, %2 +body: | + bb.0: + liveins: %w0, %w1, %w2 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = COPY %w2 + %3(s32) = G_FMA %0, %1, %2 + %x0 = COPY %3 +... + diff --git a/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll new file mode 100644 index 000000000000..2546e7c90ce5 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s + +define win64cc void @pass_va(i32 %count, ...) 
nounwind { +entry: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: str x1, [sp, #24] +; CHECK: stp x30, x8, [sp] +; CHECK: bl other_func +; CHECK: ldr x30, [sp], #80 +; CHECK: ret + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + call void @other_func(i8* %ap2) + ret void +} + +declare void @other_func(i8*) local_unnamed_addr + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_copy(i8*, i8*) nounwind + +; CHECK-LABEL: f9: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define win64cc i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f8: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #16 +; CHECK: add x0, sp, #16 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define win64cc i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f7: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #8 +; CHECK: add x0, sp, #8 +; CHECK: stp x8, x7, [sp], #16 +; CHECK: ret +define win64cc i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) 
nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll index 0a7965571480..64a6b9b6b210 100644 --- a/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -11,9 +11,8 @@ define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, ; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 ; First vararg ; CHECK: ldr {{w[0-9]+}}, [sp, #72] -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Second vararg -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Third vararg ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index b2ea9ad3b4a1..b844aab5628c 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -280,10 +280,10 @@ entry: define i32 @caller42() #3 { entry: ; CHECK-LABEL: caller42 -; CHECK: str {{x[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] -; CHECK: str {{x[0-9]+}}, [sp, #16] -; CHECK: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] ; CHECK: add x1, sp, #32 ; CHECK: mov x2, sp ; Space for s1 is allocated at sp+32 @@ -318,10 +318,10 @@ entry: ; CHECK-LABEL: caller42_stack ; CHECK: sub sp, sp, #112 ; CHECK: add x29, sp, #96 -; CHECK: stur {{x[0-9]+}}, [x29, #-16] -; CHECK: stur {{q[0-9]+}}, [x29, #-32] -; CHECK: str {{x[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: stur {{x[0-9]+}}, [x29, #-16] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32] +; CHECK-DAG: str {{x[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] ; Space for s1 is allocated 
at x29-32 = sp+64 ; Space for s2 is allocated at sp+32 ; CHECK: add x[[B:[0-9]+]], sp, #32 @@ -388,10 +388,10 @@ entry: define i32 @caller43() #3 { entry: ; CHECK-LABEL: caller43 -; CHECK: str {{q[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] -; CHECK: str {{q[0-9]+}}, [sp, #16] -; CHECK: str {{q[0-9]+}}, [sp] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] ; CHECK: add x1, sp, #32 ; CHECK: mov x2, sp ; Space for s1 is allocated at sp+32 @@ -430,10 +430,10 @@ entry: ; CHECK-LABEL: caller43_stack ; CHECK: sub sp, sp, #112 ; CHECK: add x29, sp, #96 -; CHECK: stur {{q[0-9]+}}, [x29, #-16] -; CHECK: stur {{q[0-9]+}}, [x29, #-32] -; CHECK: str {{q[0-9]+}}, [sp, #48] -; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-16] +; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: str {{q[0-9]+}}, [sp, #32] ; Space for s1 is allocated at x29-32 = sp+64 ; Space for s2 is allocated at sp+32 ; CHECK: add x[[B:[0-9]+]], sp, #32 diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll index a3b740df9b4e..fdb379871048 100644 --- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,10 +1,8 @@ ; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s ; CHECK: foo -; CHECK: str w[[REG0:[0-9]+]], [x19, #264] -; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]] -; CHECK: str w[[REG1]], [x19, #132] - +; CHECK-DAG: str w[[REG0:[0-9]+]], [x19, #132] +; CHECK-DAG: str w[[REG0]], [x19, #264] define i32 @foo(i32 %a) nounwind { %retval = alloca i32, align 4 %a.addr = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll index 990782cb69a0..c98bda0d01a0 100644 --- 
a/test/CodeGen/AArch64/arm64-extern-weak.ll +++ b/test/CodeGen/AArch64/arm64-extern-weak.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s ; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index f849df2a51ec..848b87fd2cfb 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -261,3 +261,13 @@ define void @test_inline_modifier_a(i8* %ptr) nounwind { ; CHECK: prfm pldl1keep, [x0] ret void } + +; PR33134 +define void @test_zero_address() { +entry: +; CHECK-LABEL: test_zero_address +; CHECK: mov {{x[0-9]+}}, xzr +; CHECK: ldr {{x[0-9]+}}, {{[x[0-9]+]}} + tail call i32 asm sideeffect "ldr $0, $1 \0A", "=r,*Q"(i32* null) + ret void +} diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll index f3af01a73559..9b5d8a890fa6 100644 --- a/test/CodeGen/AArch64/arm64-platform-reg.ll +++ b/test/CodeGen/AArch64/arm64-platform-reg.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-windows -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; x18 is reserved as a platform register on Darwin but not on other ; systems. Create loads of register pressure and make sure this is respected. 
diff --git a/test/CodeGen/AArch64/arm64-vext.ll b/test/CodeGen/AArch64/arm64-vext.ll index b315e4c409b0..c1edf1b2e9bf 100644 --- a/test/CodeGen/AArch64/arm64-vext.ll +++ b/test/CodeGen/AArch64/arm64-vext.ll @@ -116,7 +116,7 @@ define void @test_vext_p16() nounwind ssp { define void @test_vext_s32() nounwind ssp { ; CHECK-LABEL: test_vext_s32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xS32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -137,7 +137,7 @@ define void @test_vext_s32() nounwind ssp { define void @test_vext_u32() nounwind ssp { ; CHECK-LABEL: test_vext_u32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xU32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -158,7 +158,7 @@ define void @test_vext_u32() nounwind ssp { define void @test_vext_f32() nounwind ssp { ; CHECK-LABEL: test_vext_f32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xF32x2 = alloca <2 x float>, align 8 %__a = alloca <2 x float>, align 8 %__b = alloca <2 x float>, align 8 @@ -179,7 +179,7 @@ define void @test_vext_f32() nounwind ssp { define void @test_vext_s64() nounwind ssp { ; CHECK-LABEL: test_vext_s64: - ; CHECK_FIXME: {{ext.8.*#1}} + ; CHECK_FIXME: {{rev64.2s.*}} ; this just turns into a load of the second element %xS64x1 = alloca <1 x i64>, align 8 %__a = alloca <1 x i64>, align 8 diff --git a/test/CodeGen/AArch64/atomic-ops-lse.ll b/test/CodeGen/AArch64/atomic-ops-lse.ll index a85eb6b46aff..a0c418bff573 100644 --- a/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -681,3 +681,164 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ret i64 %old } +define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i8: + %old = atomicrmw sub i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; 
CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i8 %old +} + +define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i16: + %old = atomicrmw sub i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i16 %old +} + +define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32: + %old = atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i32 %old +} + +define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64: + %old = atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i64 %old +} + +define void @test_atomic_load_sub_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32_noret: + atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define void @test_atomic_load_sub_i64_noret(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64_noret: + 
atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i8: + %old = atomicrmw and i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i8 %old +} + +define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i16: + %old = atomicrmw and i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i16 %old +} + +define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32: + %old = atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i32 %old +} + +define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i64: + %old = atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret 
i64 %old +} + +define void @test_atomic_load_and_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32_noret: + atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} + +define void @test_atomic_load_and_i64_noret(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i64_noret: + atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll index 20ba3fea8377..a2fa1db8a8ac 100644 --- a/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -9,7 +9,7 @@ main_: %i32T = alloca i32, align 4 %i32F = alloca i32, align 4 %i32X = alloca i32, align 4 - store i32 0, i32* %tmp + store i32 %argc, i32* %tmp store i32 15, i32* %i32T, align 4 store i32 5, i32* %i32F, align 4 %tmp6 = load i32, i32* %tmp, align 4 diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index ac2153ad8ffe..5671a1070138 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck 
--check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.ll b/test/CodeGen/AArch64/falkor-hwpf-fix.ll new file mode 100644 index 000000000000..9f2af5adce71 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -mtriple aarch64 -mcpu=falkor -disable-post-ra | FileCheck %s + +; Check that strided load tag collisions are avoided on Falkor. + +; CHECK-LABEL: hwpf1: +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE:[0-9]+]], #-16] +; CHECK: mov x[[BASE2:[0-9]+]], x[[BASE]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE2]], #-8] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE3:[0-9]+]]] +; CHECK: mov x[[BASE4:[0-9]+]], x[[BASE3]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE4]], #8] + +define void @hwpf1(i32* %p, i32* %sp, i32* %sp2, i32* %sp3, i32* %sp4) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load1 = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %gep, i32 1 + %load2 = load i32, i32* %gep2 + + %add = add i32 %load1, %load2 + %storegep = getelementptr inbounds i32, i32* %sp, i32 %iv + store i32 %add, i32* %storegep + + %gep3 = getelementptr inbounds i32, i32* %gep, i32 2 + %load3 = load i32, i32* %gep3 + + %gep4 = getelementptr inbounds i32, i32* %gep, i32 3 + %load4 = load i32, i32* %gep4 + + %add2 = add i32 %load3, %load4 + %storegep2 = getelementptr inbounds i32, i32* %sp2, i32 %iv + store i32 %add2, i32* %storegep2 + + %gep5 = getelementptr inbounds i32, i32* %gep, i32 4 + %load5 = load i32, i32* %gep5 + + %gep6 = getelementptr inbounds i32, i32* %gep, i32 5 + %load6 = load i32, i32* %gep6 + + %add3 = add i32 %load5, %load6 + %storegep3 = getelementptr inbounds i32, i32* %sp3, i32 %iv + store i32 %add3, i32* %storegep3 + + %gep7 = getelementptr inbounds i32, i32* %gep, i32 6 + %load7 = load i32, i32* %gep7 + + %gep8 = getelementptr inbounds i32, i32* 
%gep, i32 7 + %load8 = load i32, i32* %gep8 + + %add4 = add i32 %load7, %load8 + %storegep4 = getelementptr inbounds i32, i32* %sp4, i32 %iv + store i32 %add4, i32* %storegep4 + + %inc = add i32 %iv, 8 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.mir b/test/CodeGen/AArch64/falkor-hwpf-fix.mir new file mode 100644 index 000000000000..54c8b16a9b43 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s +--- | + @g = external global i32 + + define void @hwpf1() { ret void } + define void @hwpf2() { ret void } +... +--- +# Verify that the tag collision between the loads is resolved. +# CHECK-LABEL: name: hwpf1 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWui %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf1 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %w2 = LDRWui %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... +--- +# Verify that the tag collision between the loads is resolved and written back for post increment addressing. +# CHECK-LABEL: name: hwpf2 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWpost %[[BASE]], 0 +# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf2 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %x1, %w2 = LDRWpost %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... 
diff --git a/test/CodeGen/AArch64/falkor-hwpf.ll b/test/CodeGen/AArch64/falkor-hwpf.ll new file mode 100644 index 000000000000..bbe7febe397f --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf.ll @@ -0,0 +1,106 @@ +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF + +; Check that strided access metadata is added to loads in inner loops when compiling for Falkor. + +; CHECK-LABEL: @hwpf1( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2, !falkor.strided.access !0 + +; NOHWPF-LABEL: @hwpf1( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf1(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; Check that outer loop strided load isn't marked. 
+; CHECK-LABEL: @hwpf2( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf2( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf2(i32* %p) { +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ] + br label %loop2.header + +loop2.header: + br label %loop2 + +loop2: + %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] + %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ] + %gep = getelementptr inbounds i32, i32* %p, i32 %iv2 + %load = load i32, i32* %gep + %sum.inc = add i32 %sum, %load + %inc2 = add i32 %iv2, 1 + %exitcnd2 = icmp uge i32 %inc2, 1024 + br i1 %exitcnd2, label %exit2, label %loop2 + +exit2: + %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1 + %load2 = load i32, i32* %gep2 + br label %loop1.latch + +loop1.latch: + %inc1 = add i32 %iv1, 1 + %exitcnd1 = icmp uge i32 %inc1, 1024 + br i1 %exitcnd1, label %exit, label %loop1 + +exit: + ret void +} + + +; Check that non-strided load isn't marked. 
+; CHECK-LABEL: @hwpf3( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf3( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf3(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %load + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll index 88e6f5dd01c9..386a6ecccf54 100644 --- a/test/CodeGen/AArch64/preferred-function-alignment.ll +++ b/test/CodeGen/AArch64/preferred-function-alignment.ll @@ -1,7 +1,6 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s -; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s @@ -12,6 +11,7 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s +; RUN: llc -mtriple=aarch64-unknown-linux 
-mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index bc28f477c810..bcad19e391d0 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -309,17 +309,17 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE-LABEL: foo_vararg: ; CHECK-APPLE: orr w0, wzr, #0x10 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 -; CHECK-APPLE: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 -; CHECK-APPLE: strb [[ID]], [x0, #8] +; CHECK-APPLE-DAG: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 +; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg ; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8 ; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Second vararg -; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 ; Third vararg ; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll new file mode 100644 index 000000000000..b760e4acd16a --- /dev/null +++ b/test/CodeGen/AArch64/win64_vararg.ll @@ -0,0 +1,95 @@ +; RUN: llc < %s -mtriple=aarch64-pc-win32 | FileCheck %s + +define void @pass_va(i32 %count, ...) 
nounwind { +entry: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: str x1, [sp, #24] +; CHECK: stp x30, x8, [sp] +; CHECK: bl other_func +; CHECK: ldr x30, [sp], #80 +; CHECK: ret + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + call void @other_func(i8* %ap2) + ret void +} + +declare void @other_func(i8*) local_unnamed_addr + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_copy(i8*, i8*) nounwind + +; CHECK-LABEL: f9: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f8: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #16 +; CHECK: add x0, sp, #16 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f7: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #8 +; CHECK: add x0, sp, #8 +; CHECK: stp x8, x7, [sp], #16 +; CHECK: ret +define i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) 
nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: copy1: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: stp x8, x1, [sp, #16] +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #80 +; CHECK: ret +define void @copy1(i64 %a0, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %cp = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + %cp1 = bitcast i8** %cp to i8* + call void @llvm.va_start(i8* %ap1) + call void @llvm.va_copy(i8* %cp1, i8* %ap1) + ret void +} diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll new file mode 100644 index 000000000000..e9797eff712b --- /dev/null +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -0,0 +1,312 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=HSA %s + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 + +; HSA: define void @use_workitem_id_x() #1 { +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_y() #2 { +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() 
+ store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_z() #3 { +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_x() #4 { +define void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y() #5 { +define void @use_workgroup_id_y() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_z() #6 { +define void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_ptr() #7 { +define void @use_dispatch_ptr() #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_queue_ptr() #8 { +define void @use_queue_ptr() #1 { + %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() + store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_id() #9 { +define void @use_dispatch_id() #1 { + %val = call i64 @llvm.amdgcn.dispatch.id() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y_workgroup_id_z() #10 { +define void @use_workgroup_id_y_workgroup_id_z() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_x() #1 { +define void @func_indirect_use_workitem_id_x() 
#1 { + call void @use_workitem_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workitem_id_x() #1 { +define void @kernel_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_y() #2 { +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_z() #3 { +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_x() #4 { +define void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workgroup_id_x() #4 { +define void @kernel_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_z() #6 { +define void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; HSA: define void @func_indirect_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_indirect_use_workgroup_id_y() #1 { + call void @func_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @indirect_x2_use_workgroup_id_y() #5 { +define void @indirect_x2_use_workgroup_id_y() #1 { + call void @func_indirect_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_ptr() #7 { +define void @func_indirect_use_dispatch_ptr() #1 { + call void @use_dispatch_ptr() + ret void +} + +; HSA: define void @func_indirect_use_queue_ptr() #8 { +define void @func_indirect_use_queue_ptr() #1 { + call void @use_queue_ptr() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_id() #9 { +define void 
@func_indirect_use_dispatch_id() #1 { + call void @use_dispatch_id() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #11 { +define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { + call void @func_indirect_use_workgroup_id_y_workgroup_id_z() + ret void +} + +; HSA: define void @recursive_use_workitem_id_y() #2 { +define void @recursive_use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @call_recursive_use_workitem_id_y() #2 { +define void @call_recursive_use_workitem_id_y() #1 { + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 { +define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { +define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 { +define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + call void @func_indirect_use_queue_ptr() + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast() #8 { +define void @indirect_use_group_to_flat_addrspacecast() #1 { + call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #11 { +define 
void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #8 { +define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @use_kernarg_segment_ptr() #14 { +define void @use_kernarg_segment_ptr() #1 { + %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 { +define void @func_indirect_use_kernarg_segment_ptr() #1 { + call void @use_kernarg_segment_ptr() + ret void +} + +; HSA: define void @use_implicitarg_ptr() #14 { +define void @use_implicitarg_ptr() #1 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_implicitarg_ptr() #14 { +define void @func_indirect_use_implicitarg_ptr() #1 { + call void @use_implicitarg_ptr() + ret void +} + +; HSA: declare void @external.func() #15 +declare void @external.func() #3 + +; HSA: define internal void @defined.func() #15 { +define internal void @defined.func() #3 { + ret void +} + +; HSA: define void @func_call_external() #15 { +define void @func_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define void @func_call_defined() #15 { +define void @func_call_defined() #3 { + call void @defined.func() + ret void +} + +; HSA: define void @func_call_asm() #15 { +define void @func_call_asm() #3 { + call void asm sideeffect "", ""() #3 + ret void +} + +; HSA: define amdgpu_kernel void @kern_call_external() #16 { +define amdgpu_kernel void 
@kern_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define amdgpu_kernel void @func_kern_defined() #16 { +define amdgpu_kernel void @func_kern_defined() #3 { + call void @defined.func() + ret void +} + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind "target-cpu"="fiji" } +attributes #2 = { nounwind "target-cpu"="gfx900" } +attributes #3 = { nounwind } + +; HSA: attributes #0 = { nounwind readnone speculatable } +; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" } +; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" } +; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" } +; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" } +; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" } +; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" } +; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" } +; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #11 = { nounwind "target-cpu"="fiji" } +; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } +; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } +; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } +; HSA: attributes #15 = { nounwind } +; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" } diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index f7461b925ca1..3059a95a5098 100644 --- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -10,6 +10,7 @@ declare i32 
@llvm.amdgcn.workitem.id.z() #0 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { @@ -164,6 +165,15 @@ define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { ret void } +; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 { +define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* + %val = load i32, i32 addrspace(2)* %bc + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* @@ -236,3 +246,4 @@ attributes #1 = { nounwind } ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" } +; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" } diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 63a6f6a8d32c..a0694fb1e3c9 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -36,7 +36,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_2048 ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 7 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; 
CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_2048() #3 { diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 3dda73bc336e..a5e97205de21 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -118,7 +118,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; CHECK-LABEL: {{^}}exactly_10: ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float addrspace(1)* @var @@ -188,3 +188,15 @@ define amdgpu_kernel void @exactly_10() #9 { ret void } attributes #9 = {"amdgpu-waves-per-eu"="10,10"} + +; Exactly 256 workitems and exactly 2 waves. +; CHECK-LABEL: {{^}}empty_workitems_exactly_256_waves_exactly_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_workitems_exactly_256_waves_exactly_2() #10 { +entry: + ret void +} +attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"} diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 5383bbe71ae3..5ffa45595e70 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void 
@test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] -; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: -; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -465,6 +471,49 @@ entry: ret float %canonicalized } +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 +; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]], +; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-DENORM-NOT: 1.0 +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void 
@test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 +; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %v = load double, double addrspace(1)* %gep, align 8 + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 +; GCN: flat_load_ushort [[V:v[0-9]+]], +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %v = load half, half addrspace(1)* %gep, align 2 + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + declare float @llvm.canonicalize.f32(float) #0 declare double @llvm.canonicalize.f64(double) #0 declare 
half @llvm.canonicalize.f16(half) #0 @@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0 declare double @llvm.maxnum.f64(double, double) #0 attributes #0 = { nounwind readnone } +attributes #1 = { "no-nans-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll index 9b1368493ba5..6b22cb0b7e28 100644 --- a/test/CodeGen/AMDGPU/function-args.ll +++ b/test/CodeGen/AMDGPU/function-args.ll @@ -34,6 +34,22 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 { ret void } +; GCN-LABEL: {{^}}i1_arg_i1_use: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1 +define void @i1_arg_i1_use(i1 %arg) #0 { +bb: + br i1 %arg, label %bb2, label %bb1 + +bb1: + store volatile i32 0, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + ; GCN-LABEL: {{^}}void_func_i8: ; GCN-NOT: v0 ; GCN: buffer_store_byte v0, off diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 972fbd66ef37..0b19fbe7d70c 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -40,7 +40,7 @@ ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .amdgpu_hsa_kernel simple +; HSA-LABEL: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -65,3 +65,11 @@ entry: store i32 0, i32 addrspace(1)* %out ret void } + +; HSA-LABEL: .amdgpu_hsa_kernel simple_no_kernargs +; HSA: enable_sgpr_kernarg_segment_ptr = 0 +define amdgpu_kernel void @simple_no_kernargs() { +entry: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 9a27809f37bb..70e6b408ca29 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ 
-49,6 +49,18 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x ret void } +; ALL-LABEL: {{^}}test_no_kernargs: +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: s_load_dword s{{[0-9]+}}, s[4:5] +define amdgpu_kernel void @test_no_kernargs() #1 { + %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* + %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(2)* %gep + store volatile i32 %value, i32 addrspace(1)* undef + ret void +} + declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index f0af876567b4..1c3cba8d3e4f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index ee58d359a935..a466671d8c55 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: image_store diff --git a/test/CodeGen/AMDGPU/move-to-valu-worklist.ll b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll new file 
mode 100644 index 000000000000..539eed92d540 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s + +; In moveToVALU(), move to vector ALU is performed, all instrs in +; the use chain will be visited. We do not want the same node to be +; pushed to the visit worklist more than once. + +; GCN-LABEL: {{^}}in_worklist_once: +; GCN: buffer_load_dword +; GCN: BB0_1: +; GCN: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @in_worklist_once() #0 { +bb: + %tmp = load i64, i64* undef +br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = phi i64 [ undef, %bb ], [ %tmp16, %bb1 ] + %tmp3 = phi i64 [ %tmp, %bb ], [ undef, %bb1 ] + %tmp11 = shl i64 %tmp2, 14 + %tmp13 = xor i64 %tmp11, %tmp2 + %tmp15 = and i64 %tmp3, %tmp13 + %tmp16 = xor i64 %tmp15, %tmp3 +br label %bb1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 3a0605fa182a..742c4f8af85d 100644 --- a/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,42 +5,42 @@ ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, i8* inttoptr (i32 8 to i8*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, i16* inttoptr (i32 8 to i16*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, i8* inttoptr (i32 8 to i8*) ret void @@ -65,7 +65,7 @@ define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 } ; GCN-LABEL: 
{{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, i16* inttoptr (i32 8 to i16*) ret void @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) # } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, i8* inttoptr (i32 4095 to i8*) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}} +; GCN: buffer_store_byte 
v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, i8* inttoptr (i32 4096 to i8*) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, i8* inttoptr (i32 4097 to i8*) ret void diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index 190d2b72ebaf..87f37144244e 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: OR_INT ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. define amdgpu_kernel void @_Z9chk1D_512v() #0 { diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll index 91116b0f65ea..e199d5b5df25 100644 --- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -5,7 +5,7 @@ ; then merge if-regions with the same bodies. ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. 
; XFAIL: * ; diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index dcb089010e99..cf0c7944d4cd 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -10,14 +10,14 @@ ; GCN-LABEL: {{^}}store_to_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; -should be used directly without any copies. ; OPTNONE-NOT: s_mov_b32 -; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void @@ -26,7 +26,7 @@ define amdgpu_kernel void @store_to_undef() #0 { ; GCN-LABEL: {{^}}store_to_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 124 to i32*) @@ -36,7 +36,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 { ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword 
v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef @@ -46,7 +46,7 @@ define amdgpu_kernel void @load_from_undef() #0 { ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 770bfaddb23e..a52b80ba86e5 100644 --- a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -34,7 +34,7 @@ body: | bb.0: successors: %bb.2, %bb.1 - %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, 0, implicit %exec + %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, implicit %exec %vcc = COPY killed %7 S_CBRANCH_VCCZ %bb.2, implicit killed %vcc diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll index 6ed730ad60f4..5e0178072e5e 100644 --- a/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx804 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN 
--check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s ; This used to fail due to a v_add_i32 instruction with an illegal immediate @@ -8,15 +8,16 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -25,9 +26,10 @@ define amdgpu_ps float @ps_main(i32 %idx) { } ; GCN-LABEL: {{^}}vs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -36,9 +38,9 @@ define amdgpu_vs float @vs_main(i32 %idx) { } ; GCN-LABEL: {{^}}cs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword 
{{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -47,10 +49,15 @@ define amdgpu_cs float @cs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NOT: s_mov_b32 s0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -59,10 +66,13 @@ define amdgpu_hs float @hs_main(i32 %idx) { } ; GCN-LABEL: {{^}}gs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -71,10 +81,16 @@ define amdgpu_gs float @gs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI-NOT: s_mov_b32 s6 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx @@ -86,10 +102,14 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx diff --git a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index 4f5c582f8b58..ff1b2ad73ef0 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -332,7 +332,7 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec -# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %exec, implicit %exec +# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, implicit-def %exec, implicit %exec # VI: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %3, 0, 6, 4, implicit-def %vcc, implicit %exec # VI: %{{[0-9]+}} = V_CMPX_EQ_I32_e64 23, killed %{{[0-9]+}}, implicit-def %exec, implicit %exec @@ -345,20 +345,21 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def 
%vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 0, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def %exec, implicit %exec + name: vopc_instructions @@ -415,28 +416,28 @@ body: | V_CMPX_EQ_I32_e32 123, killed %13, implicit-def %vcc, implicit-def %exec, implicit %exec %14 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, implicit %exec %15 = V_AND_B32_e64 %5, %3, implicit %exec - %18 = V_CMPX_GT_F32_e64 0, 23, 0, 
killed %15, 0, 0, implicit-def %exec, implicit %exec + %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, implicit-def %exec, implicit %exec %16 = V_AND_B32_e64 %5, %3, implicit %exec %vcc = V_CMP_LT_I32_e64 %6, killed %16, implicit %exec %17 = V_AND_B32_e64 %5, %3, implicit %exec %19 = V_CMPX_EQ_I32_e64 23, killed %17, implicit-def %exec, implicit %exec %20 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, implicit %exec %21 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, implicit-def %exec, implicit %exec %23 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, 2, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, implicit %exec %24 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, implicit-def %exec, implicit %exec %25 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, implicit-def %exec, implicit %exec %26 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, implicit-def %exec, implicit %exec %27 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, implicit-def %exec, implicit %exec %100 = V_MOV_B32_e32 %vcc_lo, implicit %exec diff --git a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir index 913b54332119..bd222adf6a68 100644 --- 
a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir +++ b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -8,7 +8,7 @@ # GCN: %{{[0-9]+}} = V_BCNT_U32_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_BFM_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec -# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_READLANE_B32 killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec --- @@ -50,7 +50,7 @@ body: | %15 = V_BFM_B32_e64 %13, killed %14, implicit-def %vcc, implicit %exec %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec - %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, 0, implicit-def %vcc, implicit %exec + %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, implicit-def %vcc, implicit %exec %18 = V_LSHRREV_B32_e64 16, %17, implicit %exec %19 = V_READLANE_B32 killed %18, 0, implicit-def %vcc, implicit %exec diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 51771c9723e0..04ff4c87ea77 100644 --- a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -19,11 +19,11 @@ declare void @llvm.debugtrap() #0 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_trap: ; HSA-TRAP: enable_trap_handler = 1 @@ -45,11 +45,11 @@ define amdgpu_kernel void @hsa_trap() { ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-WARNING: warning: :0:0: in 
function hsa_debugtrap void (): debugtrap handler not supported ; GCN-LABEL: {{^}}hsa_debugtrap: diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 6eb937e71b1b..54991d3d953c 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -81,7 +81,7 @@ body: | %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) %sgpr7 = S_MOV_B32 61440 %sgpr6 = S_MOV_B32 -1 - %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, implicit %exec S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc bb.2.if: diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 135f02ac205a..feae5e9f3792 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -19,8 +19,9 @@ ; HSA: workitem_private_segment_byte_size = 1536 ; GCN-NOT: flat_scr +; MESA-NOT: s_mov_b32 s3 +; HSA-NOT: s_mov_b32 s7 -; GCNMESA-DAG: s_mov_b32 s16, s3 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-DAG: s_mov_b32 s14, -1 @@ -29,17 +30,32 @@ ; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword 
{{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} + +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} + + + +; HSA: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} + +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index ca2366a361fb..afbd06a00fae 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 
s[[OFFREG:[0-9]+]], s12 +; GCN-NOT: s_mov_b32 s12 ; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1 @@ -22,8 +22,8 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 6a1da0dfe85f..0e3ef479bc3c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -45,6 +45,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_br() { ret void } + define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } @@ -1173,6 +1175,43 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... 
--- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + ; CHECK: bb.0 + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + ; CHECK: [[COND:%[0-9]+]] = COPY %r0 + + G_BRCOND %0(s1), %bb.1 + ; CHECK: TSTri [[COND]], 1, 14, _, implicit-def %cpsr + ; CHECK: Bcc %bb.1, 0, %cpsr + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.1: + ; CHECK: bb.1 + successors: %bb.2(0x80000000) + + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.2: + ; CHECK: bb.2 + + BX_RET 14, _ + ; CHECK: BX_RET 14, _ +... +--- name: test_soft_fp_double # CHECK-LABEL: name: test_soft_fp_double legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll index c778caacd0f4..c2e8c5abca4e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll @@ -87,3 +87,55 @@ define arm_aapcscc i32 @test_urem_i32(i32 %x, i32 %y) { %r = urem i32 %x, %y ret i32 %r } + +define arm_aapcscc i16 @test_srem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_srem_i16: +; CHECK-DAG: sxth r0, r0 +; CHECK-DAG: sxth r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i16 @test_urem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_urem_i16: +; CHECK-DAG: uxth r0, r0 +; CHECK-DAG: uxth r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i8 @test_srem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_i8: +; CHECK-DAG: sxtb r0, r0 +; CHECK-DAG: sxtb r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], 
[[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i8 %x, %y + ret i8 %r +} + +define arm_aapcscc i8 @test_urem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_i8: +; CHECK-DAG: uxtb r0, r0 +; CHECK-DAG: uxtb r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i8 %x, %y + ret i8 %r +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index 4c498ff6ca9b..419bcf71c106 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -420,3 +420,42 @@ entry: %r = select i1 %cond, i32* %a, i32* %b ret i32* %r } + +define arm_aapcscc void @test_br() { +; CHECK-LABEL: test_br +; CHECK: [[LABEL:.L[[:alnum:]_]+]]: +; CHECK: b [[LABEL]] +entry: + br label %infinite + +infinite: + br label %infinite +} + +declare arm_aapcscc void @brcond1() +declare arm_aapcscc void @brcond2() + +define arm_aapcscc void @test_brcond(i32 %n) { +; CHECK-LABEL: test_brcond +; CHECK: cmp r0 +; CHECK-NEXT: movgt [[RCMP:r[0-9]+]], #1 +; CHECK: tst [[RCMP]], #1 +; CHECK-NEXT: bne [[FALSE:.L[[:alnum:]_]+]] +; CHECK: blx brcond1 +; CHECK: [[FALSE]]: +; CHECK: blx brcond2 +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.true, label %if.false + +if.true: + call arm_aapcscc void @brcond1() + br label %if.end + +if.false: + call arm_aapcscc void @brcond2() + br label %if.end + +if.end: + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir index 9a0877846fc3..f436c3774c86 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -14,6 +14,12 @@ define void @test_srem_i32() { ret void } define void @test_urem_i32() { ret void } + + define void @test_srem_i16() { ret void } + 
define void @test_urem_i16() { ret void } + + define void @test_srem_i8() { ret void } + define void @test_urem_i8() { ret void } ... --- name: test_sdiv_i32 @@ -323,3 +329,171 @@ body: | %r0 = COPY %2(s32) BX_RET 14, _, implicit %r0 ... +--- +name: test_srem_i16 +# CHECK-LABEL: name: test_srem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s16) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_urem_i16 +# CHECK-LABEL: name: test_urem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s16) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_srem_i8 +# CHECK-LABEL: name: test_srem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s8) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_urem_i8 +# CHECK-LABEL: name: test_urem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s8) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 4575341dfc29..616f29d3b068 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -42,6 +42,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_brcond() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -863,6 +865,40 @@ body: | BX_RET 14, _, implicit %r0 ... 
--- +name: test_brcond +# CHECK-LABEL: name: test_brcond +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + G_BRCOND %2(s1), %bb.1 + ; G_BRCOND with s1 is legal, so we should find it unchanged in the output + ; CHECK: G_BRCOND {{%[0-9]+}}(s1), %bb.1 + G_BR %bb.2 + + bb.1: + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 + + bb.2: + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 + +... +--- name: test_fadd_s32 # CHECK-LABEL: name: test_fadd_s32 legalized: false diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index ffca431d96ea..638c6e620926 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -40,6 +40,8 @@ define void @test_select_s32() { ret void } + define void @test_br() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -828,6 +830,34 @@ body: | %r0 = COPY %3(s32) BX_RET 14, _, implicit %r0 +... +--- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: false +# CHECK: regBankSelected: true +selected: false +registers: + - { id: 0, class: _ } +# CHECK: { id: 0, class: gprb, preferred-register: '' } +# Check that we map the condition of the G_BRCOND into the GPR. +# For the G_BR, there are no registers to map, but make sure we don't crash. +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + G_BRCOND %0(s1), %bb.1 + G_BR %bb.2 + + bb.1: + BX_RET 14, _ + + bb.2: + BX_RET 14, _ + ... 
--- name: test_fadd_s32 diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index 23c4ccea4604..644a7fbf8d9a 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -26,6 +26,7 @@ entry: store i32 3855, i32* %xort store i32 4, i32* %temp %tmp = load i32, i32* %temp + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -35,6 +36,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %0 = atomicrmw add i32* %val1, i32 %tmp monotonic store i32 %0, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -44,6 +46,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %1 = atomicrmw sub i32* %val2, i32 30 monotonic store i32 %1, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -53,6 +56,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %2 = atomicrmw add i32* %val2, i32 1 monotonic store i32 %2, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -62,6 +66,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %3 = atomicrmw sub i32* %val2, i32 1 monotonic store i32 %3, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: and ; CHECK: strex @@ -71,6 +76,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %4 = atomicrmw and i32* %andt, i32 4080 monotonic store i32 %4, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: or ; CHECK: strex @@ -80,6 +86,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %5 = atomicrmw or i32* %ort, i32 4080 monotonic store i32 %5, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: eor ; CHECK: strex @@ -89,6 +96,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %6 = atomicrmw xor i32* %xort, i32 4080 monotonic 
store i32 %6, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -98,6 +106,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %7 = atomicrmw min i32* %val2, i32 16 monotonic store i32 %7, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %neg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -108,6 +117,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %8 = atomicrmw min i32* %val2, i32 %neg monotonic store i32 %8, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -117,6 +127,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %9 = atomicrmw max i32* %val2, i32 1 monotonic store i32 %9, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -126,6 +137,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %10 = atomicrmw max i32* %val2, i32 0 monotonic store i32 %10, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -135,6 +147,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %11 = atomicrmw umin i32* %val2, i32 16 monotonic store i32 %11, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %uneg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -145,6 +158,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic store i32 %12, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -154,6 +168,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %13 = atomicrmw umax i32* %val2, i32 1 monotonic store i32 %13, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll index 
d6f07f653576..e415b059692e 100644 --- a/test/CodeGen/AVR/branch-relaxation.ll +++ b/test/CodeGen/AVR/branch-relaxation.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=avr | FileCheck %s -; CHECKC-LABEL: relax_breq +; CHECK-LABEL: relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: brne LBB0_1 ; CHECK: rjmp LBB0_2 @@ -66,7 +66,7 @@ finished: ret i8 3 } -; CHECKC-LABEL: no_relax_breq +; CHECK-LABEL: no_relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]] ; CHECK: nop diff --git a/test/CodeGen/BPF/select_ri.ll b/test/CodeGen/BPF/select_ri.ll new file mode 100644 index 000000000000..c4ac376502b8 --- /dev/null +++ b/test/CodeGen/BPF/select_ri.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s +; +; Source file: +; int b, c; +; int test() { +; int a = b; +; if (a) +; a = c; +; return a; +; } +@b = common local_unnamed_addr global i32 0, align 4 +@c = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readonly +define i32 @test() local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @b, align 4 + %tobool = icmp eq i32 %0, 0 + %1 = load i32, i32* @c, align 4 + %. = select i1 %tobool, i32 0, i32 %1 +; CHECK: r1 = ll +; CHECK: r1 = *(u32 *)(r1 + 0) +; CHECK: if r1 == 0 goto + ret i32 %. 
+} + +attributes #0 = { norecurse nounwind readonly } diff --git a/test/CodeGen/BPF/setcc.ll b/test/CodeGen/BPF/setcc.ll index 294c49365670..7e20814da807 100644 --- a/test/CodeGen/BPF/setcc.ll +++ b/test/CodeGen/BPF/setcc.ll @@ -7,7 +7,7 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccweqand: -; CHECK: if r1 == r2 +; CHECK: if r1 == 0 define i16 @sccwneand(i16 %a, i16 %b) nounwind { %t1 = and i16 %a, %b @@ -16,7 +16,7 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccwneand: -; CHECK: if r1 != r2 +; CHECK: if r1 != 0 define i16 @sccwne(i16 %a, i16 %b) nounwind { %t1 = icmp ne i16 %a, %b diff --git a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll index 9e4664ad69c9..48c5f8f4d247 100644 --- a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll +++ b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s ; Bug: PR31341 -; XFAIL: avr ;; Date: Jul 29, 2003. ;; From: test/Programs/MultiSource/Ptrdist-bc diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll index a9a33d72bca2..afa2e8a72ed1 100644 --- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll +++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - ; This caused ScheduleDAG to crash in EmitPhysRegCopy when searching ; the uses of a copy to a physical register without ignoring non-data ; dependence, PR10220. 
diff --git a/test/CodeGen/Generic/print-mul-exp.ll b/test/CodeGen/Generic/print-mul-exp.ll index 91c8147aaad9..1426fb59f669 100644 --- a/test/CodeGen/Generic/print-mul-exp.ll +++ b/test/CodeGen/Generic/print-mul-exp.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @a_mul_str = internal constant [13 x i8] c"a * %d = %d\0A\00" ; <[13 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-mul.ll b/test/CodeGen/Generic/print-mul.ll index 4b60d759278a..20fb1be6edef 100644 --- a/test/CodeGen/Generic/print-mul.ll +++ b/test/CodeGen/Generic/print-mul.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-shift.ll b/test/CodeGen/Generic/print-shift.ll index 56b3ec1df760..1fda55420b59 100644 --- a/test/CodeGen/Generic/print-shift.ll +++ b/test/CodeGen/Generic/print-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/v-split.ll b/test/CodeGen/Generic/v-split.ll index 91aece94fecd..f9a1cee440ca 100644 --- a/test/CodeGen/Generic/v-split.ll +++ b/test/CodeGen/Generic/v-split.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %f8 = type <8 x float> define void @test_f8(%f8 *%P, %f8* %Q, %f8 *%S) { diff --git a/test/CodeGen/Generic/vector-redux.ll b/test/CodeGen/Generic/vector-redux.ll index 64562d6d9490..8efdbf85b8c0 100644 --- a/test/CodeGen/Generic/vector-redux.ll +++ b/test/CodeGen/Generic/vector-redux.ll @@ -1,9 +1,6 @@ ; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; Bug: PR31898 -; XFAIL: avr - @a = global [1024 x i32] zeroinitializer, align 16 define i32 @reduce_add() { 
diff --git a/test/CodeGen/Generic/vector.ll b/test/CodeGen/Generic/vector.ll index 9c0cacdcd878..2d4dc501a53a 100644 --- a/test/CodeGen/Generic/vector.ll +++ b/test/CodeGen/Generic/vector.ll @@ -1,9 +1,6 @@ ; Test that vectors are scalarized/lowered correctly. ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %d8 = type <8 x double> %f1 = type <1 x float> %f2 = type <2 x float> diff --git a/test/CodeGen/Hexagon/intrinsics/system_user.ll b/test/CodeGen/Hexagon/intrinsics/system_user.ll index ac4c53e221d0..23473c92da91 100644 --- a/test/CodeGen/Hexagon/intrinsics/system_user.ll +++ b/test/CodeGen/Hexagon/intrinsics/system_user.ll @@ -1,13 +1,71 @@ -; RUN: llc -march=hexagon -O0 < %s | FileCheck %s -; RUN: llc -march=hexagon -O0 < %s | FileCheck -check-prefix=CHECK-CALL %s -; Hexagon Programmer's Reference Manual 11.9.1 SYSTEM/USER +; RUN: llc -march=hexagon < %s | FileCheck %s -; CHECK-CALL-NOT: call +target triple = "hexagon" -; Data cache prefetch -declare void @llvm.hexagon.prefetch(i8*) -define void @prefetch(i8* %a) { - call void @llvm.hexagon.prefetch(i8* %a) +; CHECK-LABEL: dc00: +; CHECK: dcfetch +define void @dc00(i8* nocapture readonly %p) local_unnamed_addr #0 { + tail call void @llvm.hexagon.prefetch(i8* %p) ret void } -; CHECK: dcfetch({{.*}}+#0) + +; CHECK-LABEL: dc01: +; CHECK: dccleana +define void @dc01(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleana(i8* %p) + ret void +} + +; CHECK-LABEL: dc02: +; CHECK: dccleaninva +define void @dc02(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleaninva(i8* %p) + ret void +} + +; CHECK-LABEL: dc03: +; CHECK: dcinva +define void @dc03(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dcinva(i8* %p) + ret void +} + +; CHECK-LABEL: dc04: +; CHECK: dczeroa +define void @dc04(i8* nocapture %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dczeroa(i8* %p) + 
ret void +} + +; CHECK-LABEL: dc05: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}) +define void @dc05(i8* nocapture readonly %p, i32 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y4.l2fetch(i8* %p, i32 %q) + ret void +} + +; CHECK-LABEL: dc06: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}:{{[0-9]+}}) +define void @dc06(i8* nocapture readonly %p, i64 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y5.l2fetch(i8* %p, i64 %q) + ret void +} + +declare void @llvm.hexagon.prefetch(i8* nocapture) #1 +declare void @llvm.hexagon.Y2.dccleana(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dccleaninva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dcinva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dczeroa(i8* nocapture) #3 +declare void @llvm.hexagon.Y4.l2fetch(i8* nocapture readonly, i32) #2 +declare void @llvm.hexagon.Y5.l2fetch(i8* nocapture readonly, i64) #2 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" } +attributes #1 = { inaccessiblemem_or_argmemonly nounwind } +attributes #2 = { nounwind } +attributes #3 = { argmemonly nounwind writeonly } diff --git a/test/CodeGen/Hexagon/switch-lut-explicit-section.ll b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll new file mode 100644 index 000000000000..6c67a0dab1a8 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll @@ -0,0 +1,32 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=FUNCTEXT %s +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of lookup table in explicit section from the attribute set. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}tcm.hexagon, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT-NOT: .text +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 section "tcm.hexagon" { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-function-section.ll b/test/CodeGen/Hexagon/switch-lut-function-section.ll new file mode 100644 index 000000000000..bb2b1e798c8a --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-function-section.ll @@ -0,0 +1,30 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of lookup table in function's text section. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}text.foo, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll new file mode 100644 index 000000000000..57fdfbf33abc --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll @@ -0,0 +1,42 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;If the look up table is used by more than one function, we should ignore the +;flag and place it the rodata. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +define i32 @goo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-text-section.ll b/test/CodeGen/Hexagon/switch-lut-text-section.ll new file mode 100644 index 000000000000..b4d3e898d103 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-text-section.ll @@ -0,0 +1,27 @@ 
+;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;This test checks the placement of lookup table in text section. +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT-NOT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/v6vec-vprint.ll b/test/CodeGen/Hexagon/v6vec-vprint.ll index 224547c24b75..24daeac3fb5d 100644 --- a/test/CodeGen/Hexagon/v6vec-vprint.ll +++ b/test/CodeGen/Hexagon/v6vec-vprint.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx 
-disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck %s ; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print -trace-hex-vector-stores-only < %s | FileCheck --check-prefix=VSTPRINT %s ; generate .long XXXX which is a vector debug print instruction. ; CHECK: .long 0x1dffe0 diff --git a/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll new file mode 100644 index 000000000000..32abb75f20f4 --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=hexagon -O0 < %s | FileCheck %s + +; CHECK-LABEL: danny: +; CHECK-DAG: [[T0:r[0-9]+]] = memuh(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memuh(r0+#2) +; CHECK: [[T0]] |= asl([[T1]],#16) +; CHECK-DAG: [[T2:r[0-9]+]] = memuh(r0+#4) +; CHECK-DAG: [[T3:r[0-9]+]] = memuh(r0+#6) +; CHECK: [[T2]] |= asl([[T3]],#16) +; CHECK: combine([[T2]],[[T0]]) +define <4 x i16> @danny(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 2 + ret <4 x i16> %t0 +} + +; CHECK-LABEL: sammy: +; CHECK-DAG: [[T0:r[0-9]+]] = memw(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memw(r0+#4) +; CHECK: combine([[T1]],[[T0]]) +define <4 x i16> @sammy(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 4 + ret <4 x i16> %t0 +} diff --git a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll b/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll deleted file mode 100644 index f49a1e24a1bb..000000000000 --- a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s - -; Check that store is post-incremented. 
-; CHECK: memuh(r{{[0-9]+}}+#6) -; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}}) -; CHECK: vaddh - -target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" -target triple = "hexagon" - -define void @matrix_add_const(i32 %N, i16* nocapture %A, i16 signext %val) #0 { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %polly.cond - -for.end.loopexit: ; preds = %polly.stmt.for.body29 - br label %for.end - -for.end: ; preds = %for.end.loopexit, %polly.loop_header24.preheader, %entry - ret void - -polly.cond: ; preds = %entry - %0 = icmp sgt i32 %N, 3 - br i1 %0, label %polly.then, label %polly.loop_header24.preheader - -polly.then: ; preds = %polly.cond - %1 = add i32 %N, -1 - %leftover_lb = and i32 %1, -4 - %2 = icmp sgt i32 %leftover_lb, 0 - br i1 %2, label %polly.loop_body.lr.ph, label %polly.loop_header24.preheader - -polly.loop_body.lr.ph: ; preds = %polly.then - %3 = insertelement <4 x i16> undef, i16 %val, i32 0 - %4 = insertelement <4 x i16> %3, i16 %val, i32 1 - %5 = insertelement <4 x i16> %4, i16 %val, i32 2 - %6 = insertelement <4 x i16> %5, i16 %val, i32 3 - br label %polly.loop_body - -polly.loop_header24.preheader.loopexit: ; preds = %polly.loop_body - br label %polly.loop_header24.preheader - -polly.loop_header24.preheader: ; preds = %polly.loop_header24.preheader.loopexit, %polly.then, %polly.cond - %polly.loopiv27.ph = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.then ], [ %leftover_lb, %polly.loop_header24.preheader.loopexit ] - %7 = icmp slt i32 %polly.loopiv27.ph, %N - br i1 %7, label %polly.stmt.for.body29.preheader, label %for.end - -polly.stmt.for.body29.preheader: ; preds = %polly.loop_header24.preheader - br label %polly.stmt.for.body29 - -polly.loop_body: ; preds = %polly.loop_body.lr.ph, %polly.loop_body - %p_arrayidx.phi = phi i16* [ %A, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ] - %polly.loopiv34 = phi i32 [ 0, 
%polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ] - %polly.next_loopiv = add nsw i32 %polly.loopiv34, 4 - %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>* - %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2 - %addp_vec = add <4 x i16> %_p_vec_full, %6 - store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr, align 2 - %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb - %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4 - br i1 %8, label %polly.loop_body, label %polly.loop_header24.preheader.loopexit - -polly.stmt.for.body29: ; preds = %polly.stmt.for.body29.preheader, %polly.stmt.for.body29 - %polly.loopiv2733 = phi i32 [ %polly.next_loopiv28, %polly.stmt.for.body29 ], [ %polly.loopiv27.ph, %polly.stmt.for.body29.preheader ] - %polly.next_loopiv28 = add nsw i32 %polly.loopiv2733, 1 - %p_arrayidx30 = getelementptr i16, i16* %A, i32 %polly.loopiv2733 - %_p_scalar_ = load i16, i16* %p_arrayidx30, align 2 - %p_add = add i16 %_p_scalar_, %val - store i16 %p_add, i16* %p_arrayidx30, align 2 - %exitcond = icmp eq i32 %polly.next_loopiv28, %N - br i1 %exitcond, label %for.end.loopexit, label %polly.stmt.for.body29 -} - -attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" } diff --git a/test/CodeGen/Hexagon/vect/vect-v4i16.ll b/test/CodeGen/Hexagon/vect/vect-v4i16.ll new file mode 100644 index 000000000000..f49a1e24a1bb --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-v4i16.ll @@ -0,0 +1,73 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s + +; Check that store is post-incremented. 
+; CHECK: memuh(r{{[0-9]+}}+#6) +; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK: vaddh + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" +target triple = "hexagon" + +define void @matrix_add_const(i32 %N, i16* nocapture %A, i16 signext %val) #0 { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %polly.cond + +for.end.loopexit: ; preds = %polly.stmt.for.body29 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %polly.loop_header24.preheader, %entry + ret void + +polly.cond: ; preds = %entry + %0 = icmp sgt i32 %N, 3 + br i1 %0, label %polly.then, label %polly.loop_header24.preheader + +polly.then: ; preds = %polly.cond + %1 = add i32 %N, -1 + %leftover_lb = and i32 %1, -4 + %2 = icmp sgt i32 %leftover_lb, 0 + br i1 %2, label %polly.loop_body.lr.ph, label %polly.loop_header24.preheader + +polly.loop_body.lr.ph: ; preds = %polly.then + %3 = insertelement <4 x i16> undef, i16 %val, i32 0 + %4 = insertelement <4 x i16> %3, i16 %val, i32 1 + %5 = insertelement <4 x i16> %4, i16 %val, i32 2 + %6 = insertelement <4 x i16> %5, i16 %val, i32 3 + br label %polly.loop_body + +polly.loop_header24.preheader.loopexit: ; preds = %polly.loop_body + br label %polly.loop_header24.preheader + +polly.loop_header24.preheader: ; preds = %polly.loop_header24.preheader.loopexit, %polly.then, %polly.cond + %polly.loopiv27.ph = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.then ], [ %leftover_lb, %polly.loop_header24.preheader.loopexit ] + %7 = icmp slt i32 %polly.loopiv27.ph, %N + br i1 %7, label %polly.stmt.for.body29.preheader, label %for.end + +polly.stmt.for.body29.preheader: ; preds = %polly.loop_header24.preheader + br label %polly.stmt.for.body29 + +polly.loop_body: ; preds = %polly.loop_body.lr.ph, %polly.loop_body + %p_arrayidx.phi = phi i16* [ %A, %polly.loop_body.lr.ph ], [ %p_arrayidx.inc, %polly.loop_body ] + %polly.loopiv34 = phi i32 [ 0, 
%polly.loop_body.lr.ph ], [ %polly.next_loopiv, %polly.loop_body ] + %polly.next_loopiv = add nsw i32 %polly.loopiv34, 4 + %vector_ptr = bitcast i16* %p_arrayidx.phi to <4 x i16>* + %_p_vec_full = load <4 x i16>, <4 x i16>* %vector_ptr, align 2 + %addp_vec = add <4 x i16> %_p_vec_full, %6 + store <4 x i16> %addp_vec, <4 x i16>* %vector_ptr, align 2 + %8 = icmp slt i32 %polly.next_loopiv, %leftover_lb + %p_arrayidx.inc = getelementptr i16, i16* %p_arrayidx.phi, i32 4 + br i1 %8, label %polly.loop_body, label %polly.loop_header24.preheader.loopexit + +polly.stmt.for.body29: ; preds = %polly.stmt.for.body29.preheader, %polly.stmt.for.body29 + %polly.loopiv2733 = phi i32 [ %polly.next_loopiv28, %polly.stmt.for.body29 ], [ %polly.loopiv27.ph, %polly.stmt.for.body29.preheader ] + %polly.next_loopiv28 = add nsw i32 %polly.loopiv2733, 1 + %p_arrayidx30 = getelementptr i16, i16* %A, i32 %polly.loopiv2733 + %_p_scalar_ = load i16, i16* %p_arrayidx30, align 2 + %p_add = add i16 %_p_scalar_, %val + store i16 %p_add, i16* %p_arrayidx30, align 2 + %exitcond = icmp eq i32 %polly.next_loopiv28, %N + br i1 %exitcond, label %for.end.loopexit, label %polly.stmt.for.body29 +} + +attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" } diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir index f853b551e098..c71302d97e2e 100644 --- a/test/CodeGen/MIR/AArch64/target-memoperands.mir +++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir @@ -10,13 +10,17 @@ --- # CHECK-LABEL: name: target_memoperands # CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) +# CHECK: %2(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) # CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) +# CHECK: G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) name: target_memoperands body: | bb.0: %0:_(p0) = COPY 
%x0 %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) + %2:_(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) + G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) RET_ReallyLR ... diff --git a/test/CodeGen/MIR/AMDGPU/fold-multiple.mir b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir new file mode 100644 index 000000000000..a5da33a997d3 --- /dev/null +++ b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir @@ -0,0 +1,40 @@ +# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s +--- | + define amdgpu_kernel void @test() #0 { + ret void + } + + attributes #0 = { nounwind } + +... +--- + +# This used to crash / trigger an assertion, because re-scanning the use list +# after constant-folding the definition of %3 lead to the definition of %2 +# being processed twice. + +# CHECK-LABEL: name: test +# CHECK: %2 = V_LSHLREV_B32_e32 2, killed %0, implicit %exec +# CHECK: %4 = V_AND_B32_e32 8, killed %2, implicit %exec + +name: test +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: sreg_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_128 } +body: | + bb.0 (%ir-block.0): + %0 = IMPLICIT_DEF + %1 = S_MOV_B32 2 + %2 = V_LSHLREV_B32_e64 %1, killed %0, implicit %exec + %3 = S_LSHL_B32 %1, killed %1, implicit-def dead %scc + %4 = V_AND_B32_e64 killed %2, killed %3, implicit %exec + %5 = IMPLICIT_DEF + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll index 4baf499848fd..3501861f5757 100644 --- a/test/CodeGen/MSP430/vararg.ll +++ b/test/CodeGen/MSP430/vararg.ll @@ -39,11 +39,11 @@ entry: ; CHECK-LABEL: va_copy: %vl.addr = alloca i8*, align 2 %vl2 = alloca i8*, align 2 -; CHECK: mov.w r12, 2(r1) +; CHECK-DAG: mov.w r12, 2(r1) store i8* %vl, i8** %vl.addr, align 2 %0 = bitcast i8** %vl2 to i8* %1 = bitcast i8** %vl.addr to i8* -; CHECK-NEXT: mov.w r12, 0(r1) +; CHECK-DAG: mov.w r12, 0(r1) call void @llvm.va_copy(i8* %0, i8* %1) ret void } diff --git a/test/CodeGen/Mips/2008-06-05-Carry.ll b/test/CodeGen/Mips/2008-06-05-Carry.ll index c61e1cdedea7..5e6092fc7848 100644 --- a/test/CodeGen/Mips/2008-06-05-Carry.ll +++ b/test/CodeGen/Mips/2008-06-05-Carry.ll @@ -2,20 +2,21 @@ define i64 @add64(i64 %u, i64 %v) nounwind { entry: +; CHECK-LABEL: add64: ; CHECK: addu -; CHECK: sltu +; CHECK-DAG: sltu +; CHECK-DAG: addu ; CHECK: addu -; CHECK: addu - %tmp2 = add i64 %u, %v + %tmp2 = add i64 %u, %v ret i64 %tmp2 } define i64 @sub64(i64 %u, i64 %v) nounwind { entry: -; CHECK: sub64 +; CHECK-LABEL: sub64 +; CHECK-DAG: sltu +; CHECK-DAG: subu ; CHECK: subu -; CHECK: sltu -; CHECK: addu ; CHECK: subu %tmp2 = sub i64 %u, %v ret i64 %tmp2 diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 5c0415759266..2aa824250d3b 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -59,9 +59,9 @@ entry: ; CHECK-LABEL: f123: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 -; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 34, 16 @@ -94,4 +94,4 @@ entry: ; MIPS32R2: ori $[[R0:[0-9]+]], 
$[[R0:[0-9]+]], 8 ; MIPS32R2-NOT: ins {{[[:space:]].*}} ; MIPS64R2N32: ori $[[R0:[0-9]+]], $[[R0:[0-9]+]], 8 -; MIPS64R2N32-NOT: ins {{[[:space:]].*}} \ No newline at end of file +; MIPS64R2N32-NOT: ins {{[[:space:]].*}} diff --git a/test/CodeGen/Mips/dsp-patterns.ll b/test/CodeGen/Mips/dsp-patterns.ll index 837c0d8bfc52..250d3eff37dc 100644 --- a/test/CodeGen/Mips/dsp-patterns.ll +++ b/test/CodeGen/Mips/dsp-patterns.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=R1 -; RUN: llc -march=mips -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dsp < %s | FileCheck %s -check-prefix=R1 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 ; R1-LABEL: test_lbux: ; R1: lbux ${{[0-9]+}} diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll index fcf129420234..b7cc6fc8ea75 100644 --- a/test/CodeGen/Mips/llcarry.ll +++ b/test/CodeGen/Mips/llcarry.ll @@ -14,9 +14,9 @@ entry: %add = add nsw i64 %1, %0 store i64 %add, i64* @k, align 8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} -; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ret void } @@ -28,8 +28,8 @@ entry: %sub = sub nsw i64 %0, %1 ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 +; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} store i64 %sub, i64* @l, align 8 ret void @@ -41,8 +41,7 @@ entry: %add = add nsw i64 %0, 15 ; 16: addiu ${{[0-9]+}}, 15 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, 
${{[0-9]+}}, ${{[0-9]+}} store i64 %add, i64* @m, align 8 ret void diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll index a5ecdda94ce2..63884eb03b8c 100644 --- a/test/CodeGen/Mips/llvm-ir/add.ll +++ b/test/CodeGen/Mips/llvm-ir/add.ll @@ -1,35 +1,35 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=ALL,R2-R6,GP32 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: 
-check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -O2 -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM32 +; RUN: -check-prefixes=ALL,MMR3,MM32 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -O2 | FileCheck %s \ ; RUN: -check-prefixes=ALL,MMR6,MM32 ; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -O2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM64 +; RUN: -check-prefixes=ALL,MM64 ; FIXME: This code sequence is inefficient as it should be 'subu $[[T0]], $zero, $[[T0]'. @@ -110,17 +110,17 @@ define signext i64 @add_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: add_i64: - ; GP32: addu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $3, $7 - ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP32: addu $2, $4, $[[T1]] + ; GP32-DAG: addu $[[T0:[0-9]+]], $4, $6 + ; GP32-DAG: addu $3, $5, $7 + ; GP32: sltu $[[T1:[0-9]+]], $3, $5 + ; GP32: addu $2, $[[T0]], $[[T1]] ; GP64: daddu $2, $4, $5 - ; MM32: addu16 $3, $5, $7 - ; MM32: sltu $[[T0:[0-9]+]], $3, $7 - ; MM32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM32: addu $2, $4, $[[T1]] + ; MM32-DAG: addu16 $3, $5, $7 + ; MM32-DAG: addu16 $[[T0:[0-9]+]], $4, $6 + ; MM32: sltu $[[T1:[0-9]+]], $3, $5 + ; MM32: addu16 $2, $[[T0]], $[[T1]] ; MM64: daddu $2, $4, $5 @@ -132,49 +132,108 @@ define signext i128 @add_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: add_i128: - ; GP32: lw $[[T0:[0-9]+]], 28($sp) - ; GP32: addu $[[T1:[0-9]+]], $7, $[[T0]] - ; GP32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; GP32: lw $[[T3:[0-9]+]], 24($sp) - ; GP32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; GP32: addu $[[T5:[0-9]+]], $6, $[[T4]] - ; GP32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; GP32: lw $[[T7:[0-9]+]], 20($sp) - ; GP32: 
addu $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; GP32: lw $[[T9:[0-9]+]], 16($sp) - ; GP32: addu $3, $5, $[[T8]] - ; GP32: sltu $[[T10:[0-9]+]], $3, $[[T7]] - ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T9]] - ; GP32: addu $2, $4, $[[T11]] - ; GP32: move $4, $[[T5]] - ; GP32: move $5, $[[T1]] - - ; GP64: daddu $3, $5, $7 - ; GP64: sltu $[[T0:[0-9]+]], $3, $7 - ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: lw $[[T0:[0-9]+]], 28($sp) - ; MM32: addu $[[T1:[0-9]+]], $7, $[[T0]] - ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; MM32: lw $[[T3:[0-9]+]], 24($sp) - ; MM32: addu16 $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T5:[0-9]+]], $6, $[[T4]] - ; MM32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; MM32: lw $[[T7:[0-9]+]], 20($sp) - ; MM32: addu16 $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; MM32: lw $[[T9:[0-9]+]], 16($sp) - ; MM32: addu16 $[[T10:[0-9]+]], $5, $[[T8]] - ; MM32: sltu $[[T11:[0-9]+]], $[[T10]], $[[T7]] - ; MM32: addu $[[T12:[0-9]+]], $[[T11]], $[[T9]] - ; MM32: addu16 $[[T13:[0-9]+]], $4, $[[T12]] - ; MM32: move $4, $[[T5]] - ; MM32: move $5, $[[T1]] - + ; PRE4: move $[[R1:[0-9]+]], $5 + ; PRE4: move $[[R2:[0-9]+]], $4 + ; PRE4: lw $[[R3:[0-9]+]], 24($sp) + ; PRE4: addu $[[R4:[0-9]+]], $6, $[[R3]] + ; PRE4: lw $[[R5:[0-9]+]], 28($sp) + ; PRE4: addu $[[R6:[0-9]+]], $7, $[[R5]] + ; PRE4: sltu $[[R7:[0-9]+]], $[[R6]], $7 + ; PRE4: addu $[[R8:[0-9]+]], $[[R4]], $[[R7]] + ; PRE4: xor $[[R9:[0-9]+]], $[[R8]], $6 + ; PRE4: sltiu $[[R10:[0-9]+]], $[[R9]], 1 + ; PRE4: bnez $[[R10]], $BB5_2 + ; PRE4: sltu $[[R7]], $[[R8]], $6 + ; PRE4: lw $[[R12:[0-9]+]], 20($sp) + ; PRE4: addu $[[R13:[0-9]+]], $[[R1]], $[[R12]] + ; PRE4: lw $[[R14:[0-9]+]], 16($sp) + ; PRE4: addu $[[R15:[0-9]+]], $[[R13]], $[[R7]] + ; PRE4: addu $[[R16:[0-9]+]], $[[R2]], $[[R14]] + ; PRE4: sltu $[[R17:[0-9]+]], $[[R15]], $[[R13]] + ; PRE4: sltu $[[R18:[0-9]+]], $[[R13]], $[[R1]] + ; PRE4: addu $[[R19:[0-9]+]], $[[R16]], $[[R18]] + ; PRE4: addu $2, $[[R19]], 
$[[R17]] + + ; GP32-CMOV: lw $[[T0:[0-9]+]], 24($sp) + ; GP32-CMOV: addu $[[T1:[0-9]+]], $6, $[[T0]] + ; GP32-CMOV: lw $[[T2:[0-9]+]], 28($sp) + ; GP32-CMOV: addu $[[T3:[0-9]+]], $7, $[[T2]] + ; GP32-CMOV: sltu $[[T4:[0-9]+]], $[[T3]], $7 + ; GP32-CMOV: addu $[[T5:[0-9]+]], $[[T1]], $[[T4]] + ; GP32-CMOV: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: xor $[[T7:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: movz $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; GP32-CMOV: lw $[[T9:[0-9]+]], 20($sp) + ; GP32-CMOV: addu $[[T10:[0-9]+]], $5, $[[T4]] + ; GP32-CMOV: addu $[[T11:[0-9]+]], $[[T10]], $[[T8]] + ; GP32-CMOV: lw $[[T12:[0-9]+]], 16($sp) + ; GP32-CMOV: sltu $[[T13:[0-9]+]], $[[T11]], $[[T10]] + ; GP32-CMOV: addu $[[T14:[0-9]+]], $4, $[[T12]] + ; GP32-CMOV: sltu $[[T15:[0-9]+]], $[[T10]], $5 + ; GP32-CMOV: addu $[[T16:[0-9]+]], $[[T14]], $[[T15]] + ; GP32-CMOV: addu $[[T17:[0-9]+]], $[[T16]], $[[T13]] + ; GP32-CMOV: move $4, $[[T5]] + ; GP32-CMOV: move $5, $[[T3]] + + ; GP64: daddu $[[T0:[0-9]+]], $4, $6 + ; GP64: daddu $[[T1:[0-9]+]], $5, $7 + ; GP64: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; GP64-NOT-R2-R6: dsll $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T4:[0-9]+]], $[[T3]], 32 + ; GP64-R2-R6: dext $[[T4:[0-9]+]], $[[T2]], 0, 32 + + ; GP64: daddu $2, $[[T0]], $[[T4]] + + ; MMR3: move $[[T1:[0-9]+]], $5 + ; MMR3-DAG: lw $[[T2:[0-9]+]], 32($sp) + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3-DAG: lw $[[T4:[0-9]+]], 36($sp) + ; MMR3: addu16 $[[T5:[0-9]+]], $7, $[[T4]] + ; MMR3: sltu $[[T6:[0-9]+]], $[[T5]], $7 + ; MMR3: addu16 $[[T7:[0-9]+]], $[[T3]], $[[T6]] + ; MMR3: sltu $[[T8:[0-9]+]], $[[T7]], $6 + ; MMR3: xor $[[T9:[0-9]+]], $[[T7]], $6 + ; MMR3: movz $[[T8]], $[[T6]], $[[T9]] + ; MMR3: lw $[[T10:[0-9]+]], 28($sp) + ; MMR3: addu16 $[[T11:[0-9]+]], $[[T1]], $[[T10]] + ; MMR3: addu16 $[[T12:[0-9]+]], $[[T11]], $[[T8]] + ; MMR3: lw $[[T13:[0-9]+]], 24($sp) + ; MMR3: sltu $[[T14:[0-9]+]], $[[T12]], $[[T11]] + ; MMR3: addu16 $[[T15:[0-9]+]], $4, $[[T13]] + ; MMR3: 
sltu $[[T16:[0-9]+]], $[[T11]], $[[T1]] + ; MMR3: addu16 $[[T17:[0-9]+]], $[[T15]], $[[T16]] + ; MMR3: addu16 $2, $2, $[[T14]] + + ; MMR6: move $[[T1:[0-9]+]], $5 + ; MMR6: move $[[T2:[0-9]+]], $4 + ; MMR6: lw $[[T3:[0-9]+]], 32($sp) + ; MMR6: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR6: lw $[[T5:[0-9]+]], 36($sp) + ; MMR6: addu16 $[[T6:[0-9]+]], $7, $[[T5]] + ; MMR6: sltu $[[T7:[0-9]+]], $[[T6]], $7 + ; MMR6: addu16 $[[T8:[0-9]+]], $[[T4]], $7 + ; MMR6: sltu $[[T9:[0-9]+]], $[[T8]], $6 + ; MMR6: xor $[[T10:[0-9]+]], $[[T4]], $6 + ; MMR6: sltiu $[[T11:[0-9]+]], $[[T10]], 1 + ; MMR6: seleqz $[[T12:[0-9]+]], $[[T9]], $[[T11]] + ; MMR6: selnez $[[T13:[0-9]+]], $[[T7]], $[[T11]] + ; MMR6: lw $[[T14:[0-9]+]], 24($sp) + ; MMR6: or $[[T15:[0-9]+]], $[[T13]], $[[T12]] + ; MMR6: addu16 $[[T16:[0-9]+]], $[[T2]], $[[T14]] + ; MMR6: lw $[[T17:[0-9]+]], 28($sp) + ; MMR6: addu16 $[[T18:[0-9]+]], $[[T1]], $[[T17]] + ; MMR6: addu16 $[[T19:[0-9]+]], $[[T18]], $[[T15]] + ; MMR6: sltu $[[T20:[0-9]+]], $[[T18]], $[[T1]] + ; MMR6: sltu $[[T21:[0-9]+]], $[[T17]], $[[T18]] + ; MMR6: addu16 $2, $[[T16]], $[[T20]] + ; MMR6: addu16 $2, $[[T20]], $[[T21]] + + ; MM64: daddu $[[T0:[0-9]+]], $4, $6 ; MM64: daddu $3, $5, $7 - ; MM64: sltu $[[T0:[0-9]+]], $3, $7 - ; MM64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $3, $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $[[T0]], $[[T3]] %r = add i128 %a, %b ret i128 %r @@ -249,17 +308,16 @@ define signext i32 @add_i32_4(i32 signext %a) { define signext i64 @add_i64_4(i64 signext %a) { ; ALL-LABEL: add_i64_4: - ; GP32: addiu $[[T0:[0-9]+]], $5, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $2, $4, $[[T1]] + ; GP32: addiu $3, $5, 4 + ; GP32: sltu $[[T0:[0-9]+]], $3, $5 + ; GP32: addu $2, $4, $[[T0]] + + ; MM32: addiur2 $[[T1:[0-9]+]], $5, 4 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; 
MM32: addu16 $2, $4, $[[T2]] ; GP64: daddiu $2, $4, 4 - ; MM32: addiu $[[T0:[0-9]+]], $5, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 4 @@ -270,38 +328,67 @@ define signext i64 @add_i64_4(i64 signext %a) { define signext i128 @add_i128_4(i128 signext %a) { ; ALL-LABEL: add_i128_4: - ; GP32: addiu $[[T0:[0-9]+]], $7, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T1]], $[[T2]], $zero - ; GP32: addu $[[T3:[0-9]+]], $5, $[[T1]] - ; GP32: sltu $[[T1]], $[[T3]], $zero - ; GP32: addu $[[T1]], $4, $[[T1]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] - - ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 4 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: addiu $[[T0:[0-9]+]], $7, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T1]], 0 - ; MM32: sltu $[[T3:[0-9]+]], $[[T2]], $[[T1]] - ; MM32: addu16 $[[T3]], $5, $[[T3]] - ; MM32: sltu $[[T1]], $[[T3]], $[[T1]] - ; MM32: addu16 $[[T1]], $4, $[[T1]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 4 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] + + ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 4 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz 
$[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] + + ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: addiur2 $[[T0:[0-9]+]], $7, 4 + ; MMR3: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T0]], $7 + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3: sltu $[[T4:[0-9]+]], $[[T3]], $6 + ; MMR3: movz $[[T4]], $[[T2]], $[[T1]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T4]] + ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: addiur2 $[[T1:[0-9]+]], $7, 4 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $7 + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 4 - ; MM64: daddiu $[[T1:[0-9]+]], $zero, 4 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, $[[T3]] %r = add i128 4, %a ret i128 %r @@ -380,16 +467,15 @@ define signext i64 @add_i64_3(i64 signext %a) { ; ALL-LABEL: add_i64_3: ; GP32: addiu $[[T0:[0-9]+]], $5, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] + ; 
GP32: sltu $[[T1:[0-9]+]], $[[T0]], $5 ; GP32: addu $2, $4, $[[T1]] ; GP64: daddiu $2, $4, 3 - ; MM32: addiu $[[T0:[0-9]+]], $5, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] + ; MM32: move $[[T1:[0-9]+]], $5 + ; MM32: addius5 $[[T1]], 3 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; MM32: addu16 $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 3 @@ -400,38 +486,70 @@ define signext i64 @add_i64_3(i64 signext %a) { define signext i128 @add_i128_3(i128 signext %a) { ; ALL-LABEL: add_i128_3: - ; GP32: addiu $[[T0:[0-9]+]], $7, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T3:[0-9]+]], $[[T2]], $zero - ; GP32: addu $[[T4:[0-9]+]], $5, $[[T3]] - ; GP32: sltu $[[T5:[0-9]+]], $[[T4]], $zero - ; GP32: addu $[[T5]], $4, $[[T5]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] - - ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] - - ; MM32: addiu $[[T0:[0-9]+]], $7, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T3:[0-9]+]], 0 - ; MM32: sltu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T4]], $5, $[[T4]] - ; MM32: sltu $[[T5:[0-9]+]], $[[T4]], $[[T3]] - ; MM32: addu16 $[[T5]], $4, $[[T5]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 3 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] + + 
; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 3 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] + + ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: move $[[T1:[0-9]+]], $7 + ; MMR3: addius5 $[[T1]], 3 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR3: sltu $[[T3:[0-9]+]], $[[T1]], $7 + ; MMR3: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR3: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; MMR3: movz $[[T5]], $[[T3]], $[[T2]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T5]] + ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: move $[[T1:[0-9]+]], $7 + ; MMR6: addius5 $[[T1]], 3 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $[[T5]] + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 3 - ; MM64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, 
$[[T3]] %r = add i128 3, %a ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll index a730063c552f..655addb10a64 100644 --- a/test/CodeGen/Mips/llvm-ir/sub.ll +++ b/test/CodeGen/Mips/llvm-ir/sub.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ ; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ @@ -11,25 +11,25 @@ ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR3 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR6 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; 
RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP64,MM +; RUN: -check-prefixes=GP64,MM64 define signext i1 @sub_i1(i1 signext %a, i1 signext %b) { entry: @@ -100,10 +100,15 @@ define signext i64 @sub_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: sub_i64: - ; GP32-NOT-MM subu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $5, $7 - ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP32: subu $2, $4, $[[T1]] + ; GP32-NOT-MM: sltu $[[T0:[0-9]+]], $5, $7 + ; GP32-NOT-MM: subu $2, $4, $6 + ; GP32-NOT-MM: subu $2, $2, $[[T0]] + ; GP32-NOT-MM: subu $3, $5, $7 + + ; MM32: sltu $[[T0:[0-9]+]], $5, $7 + ; MM32: subu16 $3, $4, $6 + ; MM32: subu16 $2, $3, $[[T0]] + ; MM32: subu16 $3, $5, $7 ; GP64: dsubu $2, $4, $5 @@ -115,42 +120,109 @@ define signext i128 @sub_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: sub_i128: - ; GP32-NOT-MM: lw $[[T0:[0-9]+]], 20($sp) - ; GP32-NOT-MM: sltu $[[T1:[0-9]+]], $5, $[[T0]] - ; GP32-NOT-MM: lw $[[T2:[0-9]+]], 16($sp) - ; GP32-NOT-MM: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]] - ; GP32-NOT-MM: lw $[[T4:[0-9]+]], 24($sp) - ; GP32-NOT-MM: lw $[[T5:[0-9]+]], 28($sp) - ; GP32-NOT-MM: subu $[[T6:[0-9]+]], $7, $[[T5]] - ; GP32-NOT-MM: subu $2, $4, $[[T3]] - ; GP32-NOT-MM: sltu $[[T8:[0-9]+]], $6, $[[T4]] - ; GP32-NOT-MM: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]] - ; GP32-NOT-MM: subu $3, $5, $[[T9]] - ; GP32-NOT-MM: sltu $[[T10:[0-9]+]], $7, $[[T5]] - ; GP32-NOT-MM: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]] - ; GP32-NOT-MM: subu $4, $6, $[[T11]] - ; GP32-NOT-MM: move $5, $[[T6]] - - ; GP32-MM: lw $[[T0:[0-9]+]], 20($sp) - ; GP32-MM: sltu $[[T1:[0-9]+]], $[[T2:[0-9]+]], $[[T0]] - ; GP32-MM: lw $[[T3:[0-9]+]], 16($sp) - ; GP32-MM: addu $[[T3]], $[[T1]], $[[T3]] - ; GP32-MM: lw $[[T4:[0-9]+]], 
24($sp) - ; GP32-MM: lw $[[T5:[0-9]+]], 28($sp) - ; GP32-MM: subu $[[T1]], $7, $[[T5]] - ; GP32-MM: subu16 $[[T3]], $[[T6:[0-9]+]], $[[T3]] - ; GP32-MM: sltu $[[T6]], $6, $[[T4]] - ; GP32-MM: addu16 $[[T0]], $[[T6]], $[[T0]] - ; GP32-MM: subu16 $[[T0]], $5, $[[T0]] - ; GP32-MM: sltu $[[T6]], $7, $[[T5]] - ; GP32-MM: addu $[[T6]], $[[T6]], $[[T4]] - ; GP32-MM: subu16 $[[T6]], $6, $[[T6]] - ; GP32-MM: move $[[T2]], $[[T1]] - - ; GP64: dsubu $3, $5, $7 - ; GP64: sltu $[[T0:[0-9]+]], $5, $7 - ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP64: dsubu $2, $4, $[[T1]] +; PRE4: lw $[[T0:[0-9]+]], 24($sp) +; PRE4: lw $[[T1:[0-9]+]], 28($sp) +; PRE4: sltu $[[T2:[0-9]+]], $7, $[[T1]] +; PRE4: xor $[[T3:[0-9]+]], $6, $[[T0]] +; PRE4: sltiu $[[T4:[0-9]+]], $[[T3]], 1 +; PRE4: bnez $[[T4]] +; PRE4: move $[[T5:[0-9]+]], $[[T2]] +; PRE4: sltu $[[T5]], $6, $[[T0]] + +; PRE4: lw $[[T6:[0-9]+]], 20($sp) +; PRE4: subu $[[T7:[0-9]+]], $5, $[[T6]] +; PRE4: subu $[[T8:[0-9]+]], $[[T7]], $[[T5]] +; PRE4: sltu $[[T9:[0-9]+]], $[[T7]], $[[T5]] +; PRE4: sltu $[[T10:[0-9]+]], $5, $[[T6]] +; PRE4: lw $[[T11:[0-9]+]], 16($sp) +; PRE4: subu $[[T12:[0-9]+]], $4, $[[T11]] +; PRE4: subu $[[T13:[0-9]+]], $[[T12]], $[[T10]] +; PRE4: subu $[[T14:[0-9]+]], $[[T13]], $[[T9]] +; PRE4: subu $[[T15:[0-9]+]], $6, $[[T0]] +; PRE4: subu $[[T16:[0-9]+]], $[[T15]], $[[T2]] +; PRE4: subu $5, $7, $[[T1]] + +; MMR3: lw $[[T1:[0-9]+]], 48($sp) +; MMR3: sltu $[[T2:[0-9]+]], $6, $[[T1]] +; MMR3: xor $[[T3:[0-9]+]], $6, $[[T1]] +; MMR3: lw $[[T4:[0-9]+]], 52($sp) +; MMR3: sltu $[[T5:[0-9]+]], $7, $[[T4]] +; MMR3: movz $[[T6:[0-9]+]], $[[T5]], $[[T3]] +; MMR3: lw $[[T7:[0-8]+]], 44($sp) +; MMR3: subu16 $[[T8:[0-9]+]], $5, $[[T7]] +; MMR3: subu16 $[[T9:[0-9]+]], $[[T8]], $[[T6]] +; MMR3: sltu $[[T10:[0-9]+]], $[[T8]], $[[T2]] +; MMR3: sltu $[[T11:[0-9]+]], $5, $[[T7]] +; MMR3: lw $[[T12:[0-9]+]], 40($sp) +; MMR3: lw $[[T13:[0-9]+]], 12($sp) +; MMR3: subu16 $[[T14:[0-9]+]], $[[T13]], $[[T12]] +; MMR3: subu16 
$[[T15:[0-9]+]], $[[T14]], $[[T11]] +; MMR3: subu16 $[[T16:[0-9]+]], $[[T15]], $[[T10]] +; MMR3: subu16 $[[T17:[0-9]+]], $6, $[[T1]] +; MMR3: subu16 $[[T18:[0-9]+]], $[[T17]], $7 +; MMR3: lw $[[T19:[0-9]+]], 8($sp) +; MMR3: lw $[[T20:[0-9]+]], 0($sp) +; MMR3: subu16 $5, $[[T19]], $[[T20]] + +; MMR6: move $[[T0:[0-9]+]], $7 +; MMR6: sw $[[T0]], 8($sp) +; MMR6: move $[[T1:[0-9]+]], $5 +; MMR6: sw $4, 12($sp) +; MMR6: lw $[[T2:[0-9]+]], 48($sp) +; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]] +; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]] +; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1 +; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]] +; MMR6: lw $[[T7:[0-9]+]], 52($sp) +; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]] +; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]] +; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]] +; MMR6: lw $[[T11:[0-9]+]], 44($sp) +; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: lw $[[T18:[0-9]+]], 40($sp) +; MMR6: lw $[[T19:[0-9]+]], 12($sp) +; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]] +; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]] +; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]] +; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]] +; MMR6: subu16 $4, $[[T23]], $5 +; MMR6: lw $[[T24:[0-9]+]], 8($sp) +; MMR6: lw $[[T25:[0-9]+]], 0($sp) +; MMR6: subu16 $5, $[[T24]], $[[T25]] +; MMR6: lw $3, 4($sp) + +; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero +; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1. +; These should be combined away. + +; GP64-NOT-R2: dsubu $1, $4, $6 +; GP64-NOT-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-NOT-R2: dsll $[[T1:[0-9]+]], $[[T0]], 32 +; GP64-NOT-R2: dsrl $[[T2:[0-9]+]], $[[T1]], 32 +; GP64-NOT-R2: dsubu $2, $1, $[[T2]] +; GP64-NOT-R2: dsubu $3, $5, $7 + +; FIXME: Likewise for the sltu, dext here. 
+ +; GP64-R2: dsubu $1, $4, $6 +; GP64-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-R2: dext $[[T1:[0-9]+]], $[[T0]], 0, 32 +; GP64-R2: dsubu $2, $1, $[[T1]] +; GP64-R2: dsubu $3, $5, $7 + +; FIXME: Again, redundant sign extension. Also, microMIPSR6 has the +; dext instruction which should be used here. + +; MM64: dsubu $[[T0:[0-9]+]], $4, $6 +; MM64: sltu $[[T1:[0-9]+]], $5, $7 +; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 +; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; MM64: dsubu $2, $[[T0]], $[[T3]] +; MM64: dsubu $3, $5, $7 +; MM64: jr $ra %r = sub i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/long-calls.ll b/test/CodeGen/Mips/long-calls.ll new file mode 100644 index 000000000000..8a95e9b9307d --- /dev/null +++ b/test/CodeGen/Mips/long-calls.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=mips -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips -mattr=+long-calls,-noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s + +; RUN: llc -march=mips64 -target-abi n32 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n32 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips64 -target-abi n64 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n64 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON64 %s + +declare void @callee() +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) + +@val = internal unnamed_addr global [20 x i32] zeroinitializer, align 4 + +define void @caller() { + +; Use `jal` instruction with R_MIPS_26 relocation. +; OFF: jal callee +; OFF: jal memset + +; Save the `callee` and `memset` addresses in $25 register +; and use `jalr` for the jumps. 
+; ON32: lui $1, %hi(callee) +; ON32: addiu $25, $1, %lo(callee) +; ON32: jalr $25 + +; ON32: addiu $1, $zero, %lo(memset) +; ON32: lui $2, %hi(memset) +; ON32: addu $25, $2, $1 +; ON32: jalr $25 + +; ON64: lui $1, %highest(callee) +; ON64: daddiu $1, $1, %higher(callee) +; ON64: daddiu $1, $1, %hi(callee) +; ON64: daddiu $25, $1, %lo(callee) +; ON64: jalr $25 + +; ON64: daddiu $1, $zero, %higher(memset) +; ON64: lui $2, %highest(memset) +; ON64: lui $2, %hi(memset) +; ON64: daddiu $2, $zero, %lo(memset) +; ON64: daddu $25, $1, $2 +; ON64: jalr $25 + + call void @callee() + call void @llvm.memset.p0i8.i32(i8* bitcast ([20 x i32]* @val to i8*), i8 0, i32 80, i32 4, i1 false) + ret void +} diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll index 7baba005a072..3e1a2e8b9708 100644 --- a/test/CodeGen/Mips/madd-msub.ll +++ b/test/CodeGen/Mips/madd-msub.ll @@ -25,11 +25,11 @@ ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 -; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T4:[0-9]+]], $6, 31 +; 32R6-DAG: addu $[[T5:[0-9]+]], $[[T3]], $[[T4]] +; 32R6-DAG: addu $2, $[[T5]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -71,7 +71,7 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] ; FIXME: There's a redundant move here. 
We should remove it ; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $2, $[[T3]], $[[T2]] @@ -109,10 +109,10 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6 -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $1 +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $6 +; 32R6-DAG: addu $2, $[[T4]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -134,6 +134,17 @@ entry: ret i64 %add } +; ALL-LABEL: madd4 +; ALL-NOT: madd ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @madd4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %add = add nsw i32 %c, %mul + + ret i32 %add +} + ; ALL-LABEL: msub1: ; 32-DAG: sra $[[T0:[0-9]+]], $6, 31 @@ -148,13 +159,13 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]] -; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31 -; 32R6-DAG: subu $2, $[[T5]], $[[T4]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 +; 32R6-DAG: subu $[[T4:[0-9]+]], $[[T3]], $[[T2]] +; 32R6-DAG: subu $2, $[[T4]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -194,13 +205,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} - -; 32R6-DAG: sltu 
$[[T2:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: negu $2, $[[T3]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muhu $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: negu $[[T3:[0-9]+]], $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: d[[m:m]]ult $5, $4 ; 64-DAG: [[m]]flo $[[T0:[0-9]+]] @@ -234,12 +244,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: subu $2, $6, $[[T3]] -; 32R6-DAG: subu $3, $7, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $7, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: subu $[[T3:[0-9]+]], $6, $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $7, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -260,3 +270,14 @@ entry: %sub = sub nsw i64 %c, %mul ret i64 %sub } + +; ALL-LABEL: msub4 +; ALL-NOT: msub ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @msub4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %sub = sub nsw i32 %c, %mul + + ret i32 %sub +} diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll index ac69dc913c18..b3ed8bdd3b9a 100644 --- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll +++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll @@ -1,21 +1,21 @@ ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r5 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR5,MIPS32-O32,MIPS32R5-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; 
RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N32,MIPS64R5-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N64,MIPS64R5-N64 ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r6 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR6,MIPSR6-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N32,MIPSR6-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N64,MIPSR6-N64 diff --git a/test/CodeGen/PowerPC/PR33671.ll b/test/CodeGen/PowerPC/PR33671.ll new file mode 100644 index 000000000000..0edd2e8daff4 --- /dev/null +++ b/test/CodeGen/PowerPC/PR33671.ll @@ -0,0 +1,32 @@ +; Function Attrs: norecurse nounwind +; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 < %s | FileCheck %s +define void @test1(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 4 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 4 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + 
ret void +; CHECK-LABEL: test1 +; CHECK: lxv [[LD:[0-9]+]], 16(3) +; CHECK: stxv [[LD]], 16(4) +} + +; Function Attrs: norecurse nounwind +define void @test2(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 1 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + ret void +; CHECK-LABEL: test2 +; CHECK: addi 3, 3, 8 +; CHECK: lxvx [[LD:[0-9]+]], 0, 3 +; CHECK: addi 3, 4, 4 +; CHECK: stxvx [[LD]], 0, 3 +} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index 60bec4d18f12..3ad432872c0e 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1018,13 +1018,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}} -; P9BE-DAG: lxv +; P9BE-DAG: lxvx {{v[0-9]+}} +; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}} -; P9LE-DAG: lxv +; P9LE-DAG: lxvx {{v[0-9]+}} +; P9LE-DAG: lxvx ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 @@ -1584,16 +1584,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspsxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 
16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -2177,12 +2177,14 @@ entry: ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3) -; P9BE-DAG: lxv +; P9BE-DAG: addi r3, r3, -12 +; P9BE-DAG: lxvx {{v[0-9]+}}, 0, r3 +; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9LE-DAG: addi r3, r3, -12 +; P9LE-DAG: lxvx {{v[0-9]+}}, 0, r3 ; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr @@ -2742,16 +2744,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspuxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -3466,9 +3468,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4370,9 +4372,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: 
fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 90dd1d84fc23..6d19d7f0d629 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; FIXME: li [[R1:r[0-9]+]], 1 ; FIXME: li [[R2:r[0-9]+]], 0 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] -; CHECK-P9: lxv [[V1:v[0-9]+]] +; CHECK-P9: lxvx [[V1:v[0-9]+]] ; CHECK-P9: vadduqm v2, v2, [[V1]] ; CHECK-P9: blr @@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxv v2 -; CHECK-P9-DAG: lxv v3 +; CHECK-P9-DAG: lxvx v2 +; CHECK-P9-DAG: lxvx v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index e7640cab6aef..d573441f2cc9 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -33,11 +33,11 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] define void @bar1() { entry: @@ -56,9 +56,9 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] diff --git 
a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll index 0c29b6adad77..1ca679f474c3 100644 --- a/test/CodeGen/PowerPC/vsx-p9.ll +++ b/test/CodeGen/PowerPC/vsx-p9.ll @@ -36,8 +36,8 @@ entry: %1 = load <16 x i8>, <16 x i8>* @ucb, align 16 %add.i = add <16 x i8> %1, %0 tail call void (...) @sink(<16 x i8> %add.i) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -45,8 +45,8 @@ entry: %3 = load <16 x i8>, <16 x i8>* @scb, align 16 %add.i22 = add <16 x i8> %3, %2 tail call void (...) @sink(<16 x i8> %add.i22) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -54,8 +54,8 @@ entry: %5 = load <8 x i16>, <8 x i16>* @usb, align 16 %add.i21 = add <8 x i16> %5, %4 tail call void (...) @sink(<8 x i16> %add.i21) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -63,8 +63,8 @@ entry: %7 = load <8 x i16>, <8 x i16>* @ssb, align 16 %add.i20 = add <8 x i16> %7, %6 tail call void (...) @sink(<8 x i16> %add.i20) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -72,8 +72,8 @@ entry: %9 = load <4 x i32>, <4 x i32>* @uib, align 16 %add.i19 = add <4 x i32> %9, %8 tail call void (...) @sink(<4 x i32> %add.i19) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -81,8 +81,8 @@ entry: %11 = load <4 x i32>, <4 x i32>* @sib, align 16 %add.i18 = add <4 x i32> %11, %10 tail call void (...) 
@sink(<4 x i32> %add.i18) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -90,8 +90,8 @@ entry: %13 = load <2 x i64>, <2 x i64>* @ullb, align 16 %add.i17 = add <2 x i64> %13, %12 tail call void (...) @sink(<2 x i64> %add.i17) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -99,8 +99,8 @@ entry: %15 = load <2 x i64>, <2 x i64>* @sllb, align 16 %add.i16 = add <2 x i64> %15, %14 tail call void (...) @sink(<2 x i64> %add.i16) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -108,8 +108,8 @@ entry: %17 = load <1 x i128>, <1 x i128>* @uxb, align 16 %add.i15 = add <1 x i128> %17, %16 tail call void (...) @sink(<1 x i128> %add.i15) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -117,8 +117,8 @@ entry: %19 = load <1 x i128>, <1 x i128>* @sxb, align 16 %add.i14 = add <1 x i128> %19, %18 tail call void (...) @sink(<1 x i128> %add.i14) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -126,8 +126,8 @@ entry: %21 = load <4 x float>, <4 x float>* @vfb, align 16 %add.i13 = fadd <4 x float> %20, %21 tail call void (...) @sink(<4 x float> %add.i13) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvaddsp 34, 0, 1 ; CHECK: stxv 34, ; CHECK: bl sink @@ -135,8 +135,8 @@ entry: %23 = load <2 x double>, <2 x double>* @vdb, align 16 %add.i12 = fadd <2 x double> %22, %23 tail call void (...) 
@sink(<2 x double> %add.i12) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvadddp 0, 0, 1 ; CHECK: stxv 0, ; CHECK: bl sink diff --git a/test/CodeGen/SPARC/soft-mul-div.ll b/test/CodeGen/SPARC/soft-mul-div.ll new file mode 100644 index 000000000000..7c453dd35be7 --- /dev/null +++ b/test/CodeGen/SPARC/soft-mul-div.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=sparc -mcpu=v7 -O0 < %s | FileCheck %s + +define i32 @test_mul32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_mul32 + ; CHECK: call .umul + %m = mul i32 %a, %b + ret i32 %m +} + +define i16 @test_mul16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_mul16 + ; CHECK: call .umul + %m = mul i16 %a, %b + ret i16 %m +} + +define i8 @test_mul8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_mul8 + ; CHECK: call .umul + %m = mul i8 %a, %b + ret i8 %m +} + +define i32 @test_sdiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_sdiv32 + ; CHECK: call .div + %d = sdiv i32 %a, %b + ret i32 %d +} + +define i16 @test_sdiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_sdiv16 + ; CHECK: call .div + %d = sdiv i16 %a, %b + ret i16 %d +} + +define i8 @test_sdiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_sdiv8 + ; CHECK: call .div + %d = sdiv i8 %a, %b + ret i8 %d +} + +define i32 @test_udiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_udiv32 + ; CHECK: call .udiv + %d = udiv i32 %a, %b + ret i32 %d +} + +define i16 @test_udiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_udiv16 + ; CHECK: call .udiv + %d = udiv i16 %a, %b + ret i16 %d +} + +define i8 @test_udiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_udiv8 + ; CHECK: call .udiv + %d = udiv i8 %a, %b + ret i8 %d +} + diff --git a/test/CodeGen/SystemZ/branch-11.ll b/test/CodeGen/SystemZ/branch-11.ll new file mode 100644 index 000000000000..ce7b3ef267b4 --- /dev/null +++ b/test/CodeGen/SystemZ/branch-11.ll @@ -0,0 +1,56 @@ +; Test indirect jumps on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define i32 @f1(i32 %x, i32 %y, i32 %op) { +; CHECK-LABEL: f1: +; CHECK: ahi %r4, -1 +; CHECK: clibh %r4, 5, 0(%r14) +; CHECK: llgfr [[OP64:%r[0-5]]], %r4 +; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3 +; CHECK: larl [[BASE:%r[1-5]]] +; CHECK: bi 0([[BASE]],[[INDEX]]) +entry: + switch i32 %op, label %exit [ + i32 1, label %b.add + i32 2, label %b.sub + i32 3, label %b.and + i32 4, label %b.or + i32 5, label %b.xor + i32 6, label %b.mul + ] + +b.add: + %add = add i32 %x, %y + br label %exit + +b.sub: + %sub = sub i32 %x, %y + br label %exit + +b.and: + %and = and i32 %x, %y + br label %exit + +b.or: + %or = or i32 %x, %y + br label %exit + +b.xor: + %xor = xor i32 %x, %y + br label %exit + +b.mul: + %mul = mul i32 %x, %y + br label %exit + +exit: + %res = phi i32 [ %x, %entry ], + [ %add, %b.add ], + [ %sub, %b.sub ], + [ %and, %b.and ], + [ %or, %b.or ], + [ %xor, %b.xor ], + [ %mul, %b.mul ] + ret i32 %res +} diff --git a/test/CodeGen/SystemZ/fp-abs-03.ll b/test/CodeGen/SystemZ/fp-abs-03.ll new file mode 100644 index 000000000000..cab6c116bc08 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-03.ll @@ -0,0 +1,43 @@ +; Test floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call float @llvm.fabs.f32(float %f) + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call double @llvm.fabs.f64(double %f) + ret double %res +} + +; Test f128. With the loads and stores, a pure absolute would probably +; be better implemented using an NI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. 
+declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflpxb [[POSREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[POSREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %abs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-abs-04.ll b/test/CodeGen/SystemZ/fp-abs-04.ll new file mode 100644 index 000000000000..606bce3de36e --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-04.ll @@ -0,0 +1,46 @@ +; Test negated floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call float @llvm.fabs.f32(float %f) + %res = fsub float -0.0, %abs + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call double @llvm.fabs.f64(double %f) + %res = fsub double -0.0, %abs + ret double %res +} + +; Test f128. With the loads and stores, a pure negative-absolute would +; probably be better implemented using an OI on the upper byte. Do some +; extra processing so that using FPRs is unequivocally better. 
+declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflnxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %negabs = fsub fp128 0xL00000000000000008000000000000000, %abs + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %negabs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-add-01.ll b/test/CodeGen/SystemZ/fp-add-01.ll index 5b0ed0513a37..219607d628d7 100644 --- a/test/CodeGen/SystemZ/fp-add-01.ll +++ b/test/CodeGen/SystemZ/fp-add-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point addition. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: aeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: aeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-add-04.ll b/test/CodeGen/SystemZ/fp-add-04.ll new file mode 100644 index 000000000000..186f37ca5182 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-add-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point addition on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fadd fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll index 075c7aa3dd84..146b16bc695f 100644 --- a/test/CodeGen/SystemZ/fp-cmp-01.ll +++ b/test/CodeGen/SystemZ/fp-cmp-01.ll @@ -1,7 +1,10 @@ ; Test 32-bit floating-point comparison. The tests assume a z10 implementation ; of select, using conditional branches rather than LOCGR. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @foo() @@ -9,8 +12,9 @@ declare float @foo() define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { ; CHECK-LABEL: f1: ; CHECK: cebr %f0, %f2 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f1, %f2 %res = select i1 %cond, i64 %a, i64 %b @@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) { ; CHECK-LABEL: f2: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f2 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, float %f1, 
float *%ptr) { define i64 @f3(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f3: ; CHECK: ceb %f0, 4092(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f4: ; CHECK: aghi %r4, 4096 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f5: ; CHECK: aghi %r4, -4 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r5, 2 ; CHECK: ceb %f0, 400(%r1,%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr1 = getelementptr float, float *%base, i64 %index %ptr2 = getelementptr float, float *%ptr1, i64 100 @@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 @@ -153,8 +162,9 @@ define float @f7(float 
*%ptr0) { define i64 @f8(i64 %a, i64 %b, float %f) { ; CHECK-LABEL: f8: ; CHECK: ltebr %f0, %f0 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f, 0.0 %res = select i1 %cond, i64 %a, i64 %b @@ -166,8 +176,9 @@ define i64 @f8(i64 %a, i64 %b, float %f) { define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f9: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -179,8 +190,9 @@ define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f10: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp one float %f1, %f2 @@ -192,8 +204,9 @@ define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f11: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp olt float %f1, %f2 @@ -205,8 +218,9 @@ define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f12: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ole float %f1, %f2 @@ -218,8 +232,9 @@ define i64 @f12(i64 %a, i64 
%b, float %f2, float *%ptr) { define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f13: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnle %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oge float %f1, %f2 @@ -231,8 +246,9 @@ define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f14: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ogt float %f1, %f2 @@ -244,8 +260,9 @@ define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f15: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ueq float %f1, %f2 @@ -257,8 +274,9 @@ define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f16: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bner %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bner %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgre %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp une float %f1, %f2 @@ -270,8 +288,9 @@ define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f17: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrle %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ult float %f1, 
%f2 @@ -283,8 +302,9 @@ define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f18: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ule float %f1, %f2 @@ -296,8 +316,9 @@ define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f19: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp uge float %f1, %f2 @@ -309,8 +330,9 @@ define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f20: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ugt float %f1, %f2 diff --git a/test/CodeGen/SystemZ/fp-cmp-06.ll b/test/CodeGen/SystemZ/fp-cmp-06.ll new file mode 100644 index 000000000000..e146b51e4fb2 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-cmp-06.ll @@ -0,0 +1,33 @@ +; Test f128 comparisons on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; There is no memory form of 128-bit comparison. 
+define i64 @f1(i64 %a, i64 %b, fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r5) +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %cond = fcmp oeq fp128 %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Check comparison with zero -- it is not worthwhile to copy to +; FP pairs just so we can use LTXBR, so simply load up a zero. +define i64 @f2(i64 %a, i64 %b, fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %cond = fcmp oeq fp128 %f, 0xL00000000000000000000000000000000 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/fp-const-11.ll b/test/CodeGen/SystemZ/fp-const-11.ll new file mode 100644 index 000000000000..8523f2786c34 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-const-11.ll @@ -0,0 +1,40 @@ +; Test loads of f128 floating-point constants on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefix=CONST + +; Test loading zero. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000000000000000000000, fp128 *%x + ret void +} + +; Test loading of negative floating-point zero. +define void @f2(fp128 *%x) { +; CHECK-LABEL: f2: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: wflnxb [[REG]], [[REG]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000008000000000000000, fp128 *%x + ret void +} + +; Test loading of a 128-bit floating-point constant. 
This value would +; actually fit within the 32-bit format, but we don't have extending +; loads into vector registers. +define void @f3(fp128 *%x) { +; CHECK-LABEL: f3: +; CHECK: larl [[REGISTER:%r[1-5]+]], {{.*}} +; CHECK: vl [[REG:%v[0-9]+]], 0([[REGISTER]]) +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 +; CONST: .quad 4611404543484231680 +; CONST: .quad 0 + store fp128 0xL00000000000000003fff000002000000, fp128 *%x + ret void +} diff --git a/test/CodeGen/SystemZ/fp-conv-15.ll b/test/CodeGen/SystemZ/fp-conv-15.ll new file mode 100644 index 000000000000..61100016c426 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-15.ll @@ -0,0 +1,50 @@ +; Test f128 floating-point truncations/extensions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f128->f64. +define double @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to double + ret double %res +} + +; Test f128->f32. +define float @f2(fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 3 +; CHECK: ledbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to float + ret float %res +} + +; Test f64->f128. +define void @f3(fp128 *%dst, double %val) { +; CHECK-LABEL: f3: +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext double %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + +; Test f32->f128. 
+define void @f4(fp128 *%dst, float %val) { +; CHECK-LABEL: f4: +; CHECK: ldebr %f0, %f0 +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext float %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-conv-16.ll b/test/CodeGen/SystemZ/fp-conv-16.ll new file mode 100644 index 000000000000..4f9bb865694b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-16.ll @@ -0,0 +1,99 @@ +; Test f128 floating-point conversion to/from integers on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test signed i32->f128. +define void @f1(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK: cxfbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed i64->f128. +define void @f2(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK: cxgbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i32->f128. +define void @f3(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK: cxlfbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i64->f128. +define void @f4(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK: cxlgbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed f128->i32. +define i32 @f5(fp128 *%src) { +; CHECK-LABEL: f5: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cfxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i32 + ret i32 %conv +} + +; Test signed f128->i64. 
+define i64 @f6(fp128 *%src) { +; CHECK-LABEL: f6: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cgxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i64 + ret i64 %conv +} + +; Test unsigned f128->i32. +define i32 @f7(fp128 *%src) { +; CHECK-LABEL: f7: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clfxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i32 + ret i32 %conv +} + +; Test unsigned f128->i64. +define i64 @f8(fp128 *%src) { +; CHECK-LABEL: f8: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clgxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i64 + ret i64 %conv +} diff --git a/test/CodeGen/SystemZ/fp-copysign-02.ll b/test/CodeGen/SystemZ/fp-copysign-02.ll new file mode 100644 index 000000000000..657c0e18767b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-copysign-02.ll @@ -0,0 +1,81 @@ +; Test f128 copysign operations on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @copysignf(float, float) readnone +declare double @copysign(double, double) readnone +; FIXME: not really the correct prototype for SystemZ. +declare fp128 @copysignl(fp128, fp128) readnone + +; Test f32 copies in which the sign comes from an f128. +define float @f1(float %a, fp128 *%bptr) { +; CHECK-LABEL: f1: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to float + %res = call float @copysignf(float %a, float %b) readnone + ret float %res +} + +; Test f64 copies in which the sign comes from an f128. 
+define double @f2(double %a, fp128 *%bptr) { +; CHECK-LABEL: f2: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to double + %res = call double @copysign(double %a, double %b) readnone + ret double %res +} + +; Test f128 copies in which the sign comes from an f32. +define void @f7(fp128 *%cptr, fp128 *%aptr, float %bf) { +; CHECK-LABEL: f7: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmlh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext float %bf to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f64. +define void @f8(fp128 *%cptr, fp128 *%aptr, double %bd) { +; CHECK-LABEL: f8: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmhh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext double %bd to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f128. 
+define void @f9(fp128 *%cptr, fp128 *%aptr, fp128 *%bptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: vl [[REG2:%v[0-7]+]], 0(%r4) +; CHECK: tm +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = load volatile fp128, fp128 *%bptr + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-div-01.ll b/test/CodeGen/SystemZ/fp-div-01.ll index 0791e8db93f8..ee514dc474e9 100644 --- a/test/CodeGen/SystemZ/fp-div-01.ll +++ b/test/CodeGen/SystemZ/fp-div-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point division. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: deb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: deb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-div-04.ll b/test/CodeGen/SystemZ/fp-div-04.ll new file mode 100644 index 000000000000..54e87f46c84a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-div-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point division on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfdxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fdiv fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-move-13.ll b/test/CodeGen/SystemZ/fp-move-13.ll new file mode 100644 index 000000000000..d6c53eaceeef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-move-13.ll @@ -0,0 +1,46 @@ +; Test f128 moves on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; VR-to-VR moves. Since f128s are passed by reference, +; we need to force a copy by other means. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vlr +; CHECK: vleig +; CHECK: br %r14 + %val = load volatile fp128 , fp128 *%x + %t1 = bitcast fp128 %val to <2 x i64> + %t2 = insertelement <2 x i64> %t1, i64 0, i32 0 + %res = bitcast <2 x i64> %t2 to fp128 + store volatile fp128 %res, fp128 *%x + store volatile fp128 %val, fp128 *%x + ret void +} + +; Test 128-bit moves from GPRs to VRs. i128 isn't a legitimate type, +; so this goes through memory. +define void @f2(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f2: +; CHECK: lg +; CHECK: lg +; CHECK: stg +; CHECK: stg +; CHECK: br %r14 + %val = load i128 , i128 *%b + %res = bitcast i128 %val to fp128 + store fp128 %res, fp128 *%a + ret void +} + +; Test 128-bit moves from VRs to GPRs, with the same restriction as f2. 
+define void @f3(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f3: +; CHECK: vl +; CHECK: vst + %val = load fp128 , fp128 *%a + %res = bitcast fp128 %val to i128 + store i128 %res, i128 *%b + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-01.ll b/test/CodeGen/SystemZ/fp-mul-01.ll index 3b72d25e0b5c..126567b218ab 100644 --- a/test/CodeGen/SystemZ/fp-mul-01.ll +++ b/test/CodeGen/SystemZ/fp-mul-01.ll @@ -1,6 +1,8 @@ ; Test multiplication of two f32s, producing an f32 result. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: meeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: meeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-mul-06.ll b/test/CodeGen/SystemZ/fp-mul-06.ll index 896fafecbdaf..581e44eeaa2f 100644 --- a/test/CodeGen/SystemZ/fp-mul-06.ll +++ b/test/CodeGen/SystemZ/fp-mul-06.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: maebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: maebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmasb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call float @llvm.fma.f32 (float %f1, float %f2, float 
%acc) ret float %res @@ -14,7 +18,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) @@ -24,7 +29,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: maeb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -39,7 +45,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -54,7 +61,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -66,7 +74,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -78,7 +87,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 
+; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -92,7 +102,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: maeb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-08.ll b/test/CodeGen/SystemZ/fp-mul-08.ll index 5e5538bfacc9..5b1f9b96c089 100644 --- a/test/CodeGen/SystemZ/fp-mul-08.ll +++ b/test/CodeGen/SystemZ/fp-mul-08.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: msebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: msebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmssb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %negacc = fsub float -0.0, %acc %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) @@ -15,7 +19,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %negacc = fsub float -0.0, %acc @@ -26,7 +31,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: mseb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; 
CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -42,7 +48,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -58,7 +65,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -71,7 +79,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -84,7 +93,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -99,7 +109,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: mseb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-10.ll b/test/CodeGen/SystemZ/fp-mul-10.ll new file mode 
100644 index 000000000000..c23a6a202ad5 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-10.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @llvm.fma.f64(double %f1, double %f2, double %f3) +declare float @llvm.fma.f32(float %f1, float %f2, float %f3) + +define double @f1(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f1: +; CHECK: wfnmadb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define double @f2(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f2: +; CHECK: wfnmsdb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub double -0.0, %acc + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define float @f3(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f3: +; CHECK: wfnmasb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) + %negres = fsub float -0.0, %res + ret float %negres +} + +define float @f4(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f4: +; CHECK: wfnmssb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub float -0.0, %acc + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) + %negres = fsub float -0.0, %res + ret float %negres +} + diff --git a/test/CodeGen/SystemZ/fp-mul-11.ll b/test/CodeGen/SystemZ/fp-mul-11.ll new file mode 100644 index 000000000000..ef45bf184a4c --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-11.ll @@ -0,0 +1,32 @@ +; Test 128-bit floating-point multiplication on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fmul fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} + +define void @f2(double %f1, double %f2, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: wflld [[REG1:%v[0-9]+]], %f0 +; CHECK-DAG: wflld [[REG2:%v[0-9]+]], %f2 +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1x = fpext double %f1 to fp128 + %f2x = fpext double %f2 to fp128 + %res = fmul fp128 %f1x, %f2x + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-12.ll b/test/CodeGen/SystemZ/fp-mul-12.ll new file mode 100644 index 000000000000..331f9a30c274 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-12.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.fma.f128(fp128 %f1, fp128 %f2, fp128 %f3) + +define void @f1(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f2(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) 
+; CHECK: wfmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %neg) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f3(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + +define void @f4(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %neg) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-neg-02.ll b/test/CodeGen/SystemZ/fp-neg-02.ll new file mode 100644 index 000000000000..38fb3a58d404 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-neg-02.ll @@ -0,0 +1,41 @@ +; Test floating-point negation on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub float -0.0, %f + ret float %res +} + +; Test f64. +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub double -0.0, %f + ret double %res +} + +; Test f128. With the loads and stores, a pure negation would probably +; be better implemented using an XI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflcxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %negzero = fpext float -0.0 to fp128 + %neg = fsub fp128 0xL00000000000000008000000000000000, %orig + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %neg, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-round-03.ll b/test/CodeGen/SystemZ/fp-round-03.ll new file mode 100644 index 000000000000..762e793701d1 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-round-03.ll @@ -0,0 +1,207 @@ +; Test rounding functions for z14 and above. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test rint for f32. +declare float @llvm.rint.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call float @llvm.rint.f32(float %f) + ret float %res +} + +; Test rint for f64. +declare double @llvm.rint.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: fidbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call double @llvm.rint.f64(double %f) + ret double %res +} + +; Test rint for f128. 
+declare fp128 @llvm.rint.f128(fp128 %f) +define void @f3(fp128 *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 0, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.rint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test nearbyint for f32. +declare float @llvm.nearbyint.f32(float %f) +define float @f4(float %f) { +; CHECK-LABEL: f4: +; CHECK: fiebra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.nearbyint.f32(float %f) + ret float %res +} + +; Test nearbyint for f64. +declare double @llvm.nearbyint.f64(double %f) +define double @f5(double %f) { +; CHECK-LABEL: f5: +; CHECK: fidbra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.nearbyint.f64(double %f) + ret double %res +} + +; Test nearbyint for f128. +declare fp128 @llvm.nearbyint.f128(fp128 %f) +define void @f6(fp128 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.nearbyint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test floor for f32. +declare float @llvm.floor.f32(float %f) +define float @f7(float %f) { +; CHECK-LABEL: f7: +; CHECK: fiebra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.floor.f32(float %f) + ret float %res +} + +; Test floor for f64. +declare double @llvm.floor.f64(double %f) +define double @f8(double %f) { +; CHECK-LABEL: f8: +; CHECK: fidbra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.floor.f64(double %f) + ret double %res +} + +; Test floor for f128. 
+declare fp128 @llvm.floor.f128(fp128 %f) +define void @f9(fp128 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 7 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.floor.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test ceil for f32. +declare float @llvm.ceil.f32(float %f) +define float @f10(float %f) { +; CHECK-LABEL: f10: +; CHECK: fiebra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.ceil.f32(float %f) + ret float %res +} + +; Test ceil for f64. +declare double @llvm.ceil.f64(double %f) +define double @f11(double %f) { +; CHECK-LABEL: f11: +; CHECK: fidbra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.ceil.f64(double %f) + ret double %res +} + +; Test ceil for f128. +declare fp128 @llvm.ceil.f128(fp128 %f) +define void @f12(fp128 *%ptr) { +; CHECK-LABEL: f12: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 6 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.ceil.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test trunc for f32. +declare float @llvm.trunc.f32(float %f) +define float @f13(float %f) { +; CHECK-LABEL: f13: +; CHECK: fiebra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.trunc.f32(float %f) + ret float %res +} + +; Test trunc for f64. +declare double @llvm.trunc.f64(double %f) +define double @f14(double %f) { +; CHECK-LABEL: f14: +; CHECK: fidbra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.trunc.f64(double %f) + ret double %res +} + +; Test trunc for f128. 
+declare fp128 @llvm.trunc.f128(fp128 %f) +define void @f15(fp128 *%ptr) { +; CHECK-LABEL: f15: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 5 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.trunc.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test round for f32. +declare float @llvm.round.f32(float %f) +define float @f16(float %f) { +; CHECK-LABEL: f16: +; CHECK: fiebra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.round.f32(float %f) + ret float %res +} + +; Test round for f64. +declare double @llvm.round.f64(double %f) +define double @f17(double %f) { +; CHECK-LABEL: f17: +; CHECK: fidbra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.round.f64(double %f) + ret double %res +} + +; Test round for f128. +declare fp128 @llvm.round.f128(fp128 %f) +define void @f18(fp128 *%ptr) { +; CHECK-LABEL: f18: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 1 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.round.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll index 3680207e7f20..85a46bc2d7fc 100644 --- a/test/CodeGen/SystemZ/fp-sqrt-01.ll +++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit square root. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @llvm.sqrt.f32(float) declare float @sqrtf(float) @@ -77,7 +79,7 @@ define float @f6(float *%base, i64 %index) { ; to use SQEB if possible. 
define void @f7(float *%ptr) { ; CHECK-LABEL: f7: -; CHECK: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %val0 = load volatile float , float *%ptr %val1 = load volatile float , float *%ptr @@ -160,7 +162,7 @@ define float @f8(float %dummy, float %val) { ; CHECK: sqebr %f0, %f2 ; CHECK: cebr %f0, %f0 ; CHECK: bnor %r14 -; CHECK: ler %f0, %f2 +; CHECK: {{ler|ldr}} %f0, %f2 ; CHECK: jg sqrtf@PLT %res = tail call float @sqrtf(float %val) ret float %res diff --git a/test/CodeGen/SystemZ/fp-sqrt-04.ll b/test/CodeGen/SystemZ/fp-sqrt-04.ll new file mode 100644 index 000000000000..e0fb2569b39a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sqrt-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point square root on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.sqrt.f128(fp128 %f) + +define void @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfsqxb [[RES:%v[0-9]+]], [[REG]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %res = call fp128 @llvm.sqrt.f128(fp128 %f) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sub-01.ll b/test/CodeGen/SystemZ/fp-sub-01.ll index f4185ca3108d..41f72e1810e9 100644 --- a/test/CodeGen/SystemZ/fp-sub-01.ll +++ b/test/CodeGen/SystemZ/fp-sub-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point subtraction. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: seb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: seb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-sub-04.ll b/test/CodeGen/SystemZ/fp-sub-04.ll new file mode 100644 index 000000000000..5f88132664ef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sub-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point subtraction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fsub fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/int-add-17.ll b/test/CodeGen/SystemZ/int-add-17.ll new file mode 100644 index 000000000000..fd245871c652 --- /dev/null +++ b/test/CodeGen/SystemZ/int-add-17.ll @@ -0,0 +1,95 @@ +; Test additions between an i64 and a sign-extended i16 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check AGH with no displacement. 
+define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the aligned AGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: agh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the negative aligned AGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: agh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the low end of the AGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: agh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check that AGH allows an index. 
+define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: agh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + diff --git a/test/CodeGen/SystemZ/int-mul-09.ll b/test/CodeGen/SystemZ/int-mul-09.ll new file mode 100644 index 000000000000..3e384e72db5d --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-09.ll @@ -0,0 +1,95 @@ +; Test multiplications between an i64 and a sign-extended i16 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check MGH with no displacement. +define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the aligned MGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: mgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the negative aligned MGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: mgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the low end of the MGH range. 
+define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: mgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check that MGH allows an index. +define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: mgh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-mul-10.ll b/test/CodeGen/SystemZ/int-mul-10.ll new file mode 100644 index 000000000000..a4d80af36a3c --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-10.ll @@ -0,0 +1,165 @@ +; Test signed high-part i64->i128 multiplications on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check sign-extended multiplication in which only the high part is used. +define i64 @f1(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which only part of the high half +; is used. 
+define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk [[REG:%r[0-9]+]], %r3, %r4 +; CHECK: srlg %r2, [[REG]], 3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 67 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which the result is split into +; high and low halves. +define i64 @f3(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f3: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: ogr %r2, %r3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + %low = trunc i128 %mulx to i64 + %or = or i64 %high, %low + ret i64 %or +} + +; Check MG with no displacement. +define i64 @f4(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f4: +; CHECK-NOT: {{%r[234]}} +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %b = load i64 , i64 *%src + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the aligned MG range. +define i64 @f5(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f5: +; CHECK: mg %r2, 524280(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65535 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword up, which requires separate address logic. +; Other sequences besides this one would be OK. 
+define i64 @f6(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r4, 524288 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the negative aligned MG range. +define i64 @f7(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f7: +; CHECK: mg %r2, -8(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -1 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the low end of the MG range. +define i64 @f8(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f8: +; CHECK: mg %r2, -524288(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f9(i64 *%dest, i64 %a, i64 *%src) { +; CHECK-LABEL: f9: +; CHECK: agfi %r4, -524296 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65537 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check that MG allows an index. 
+define i64 @f10(i64 *%dest, i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f10: +; CHECK: mg %r2, 524287(%r5,%r4) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to i64 * + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + diff --git a/test/CodeGen/SystemZ/int-mul-11.ll b/test/CodeGen/SystemZ/int-mul-11.ll new file mode 100644 index 000000000000..f26251982518 --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-11.ll @@ -0,0 +1,32 @@ +; Test three-operand multiplication instructions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Check MSRKC. +define i32 @f1(i32 %dummy, i32 %a, i32 %b) { +; CHECK-LABEL: f1: +; CHECK: msrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i32 %a, %b + ret i32 %mul +} + +; Check MSGRKC. +define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK: msgrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i64 %a, %b + ret i64 %mul +} + +; Verify that we still use MSGFR for i32->i64 multiplies. +define i64 @f3(i64 %a, i32 %b) { +; CHECK-LABEL: f3: +; CHECK: msgfr %r2, %r3 +; CHECK: br %r14 + %bext = sext i32 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-sub-10.ll b/test/CodeGen/SystemZ/int-sub-10.ll new file mode 100644 index 000000000000..bf6638575e55 --- /dev/null +++ b/test/CodeGen/SystemZ/int-sub-10.ll @@ -0,0 +1,95 @@ +; Test subtractions of a sign-extended i16 from an i64 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check SGH with no displacement. 
+define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the aligned SGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: sgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the negative aligned SGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: sgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the low end of the SGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: sgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check that SGH allows an index. 
+define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: sgh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + diff --git a/test/CodeGen/SystemZ/tdc-07.ll b/test/CodeGen/SystemZ/tdc-07.ll new file mode 100644 index 000000000000..6651410e7c66 --- /dev/null +++ b/test/CodeGen/SystemZ/tdc-07.ll @@ -0,0 +1,18 @@ +; Test the Test Data Class instruction on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i32 @llvm.s390.tdc.f128(fp128, i64) + +; Check using as i32 - f128 +define i32 @f3(fp128 %x) { +; CHECK-LABEL: f3 +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: tcxb %f0, 123 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 + %res = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 123) + ret i32 %res +} + diff --git a/test/CodeGen/SystemZ/vec-abs-06.ll b/test/CodeGen/SystemZ/vec-abs-06.ll new file mode 100644 index 000000000000..8eee1d9d2507 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-abs-06.ll @@ -0,0 +1,47 @@ +; Test f32 and v4f32 absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.fabs.f32(float) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) + +; Test a plain absolute. +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vflpsb %v24, %v24 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val) + ret <4 x float> %ret +} + +; Test a negative absolute. +define <4 x float> @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: vflnsb %v24, %v24 +; CHECK: br %r14 + %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val) + %ret = fsub <4 x float> , %abs + ret <4 x float> %ret +} + +; Test an f32 absolute that uses vector registers. 
+define float @f3(<4 x float> %val) { +; CHECK-LABEL: f3: +; CHECK: wflpsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = call float @llvm.fabs.f32(float %scalar) + ret float %ret +} + +; Test an f32 negative absolute that uses vector registers. +define float @f4(<4 x float> %val) { +; CHECK-LABEL: f4: +; CHECK: wflnsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %abs = call float @llvm.fabs.f32(float %scalar) + %ret = fsub float -0.0, %abs + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-add-02.ll b/test/CodeGen/SystemZ/vec-add-02.ll new file mode 100644 index 000000000000..97a9b84a063c --- /dev/null +++ b/test/CodeGen/SystemZ/vec-add-02.ll @@ -0,0 +1,24 @@ +; Test vector addition on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 addition. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfasb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fadd <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 addition that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfasb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fadd float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-and-04.ll b/test/CodeGen/SystemZ/vec-and-04.ll new file mode 100644 index 000000000000..e9355beb4296 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-and-04.ll @@ -0,0 +1,47 @@ +; Test vector NAND on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 NAND. 
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <16 x i8> %val1, %val2 + %not = xor <16 x i8> %ret, + ret <16 x i8> %not +} + +; Test a v8i16 NAND. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <8 x i16> %val1, %val2 + %not = xor <8 x i16> %ret, + ret <8 x i16> %not +} + +; Test a v4i32 NAND. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <4 x i32> %val1, %val2 + %not = xor <4 x i32> %ret, + ret <4 x i32> %not +} + +; Test a v2i64 NAND. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnn %v24, %v26, %v28 +; CHECK: br %r14 + %ret = and <2 x i64> %val1, %val2 + %not = xor <2 x i64> %ret, + ret <2 x i64> %not +} diff --git a/test/CodeGen/SystemZ/vec-cmp-07.ll b/test/CodeGen/SystemZ/vec-cmp-07.ll new file mode 100644 index 000000000000..f272ba4bd755 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-cmp-07.ll @@ -0,0 +1,349 @@ +; Test f32 and v4f32 comparisons on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test oeq. +define <4 x i32> @f1(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfcesb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test one. 
+define <4 x i32> @f2(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vo %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ogt. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f3: +; CHECK: vfchsb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oge. +define <4 x i32> @f4(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f4: +; CHECK: vfchesb %v24, %v26, %v28 +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ole. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfchesb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test olt. +define <4 x i32> @f6(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK: vfchsb %v24, %v28, %v26 +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ueq. +define <4 x i32> @f7(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vno %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test une. 
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f8: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ugt. +define <4 x i32> @f9(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f9: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uge. +define <4 x i32> @f10(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f10: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v28, %v26 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ule. +define <4 x i32> @f11(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f11: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ult. +define <4 x i32> @f12(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f12: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v28 +; CHECK-NEXT: vno %v24, [[REG]], [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test ord. 
+define <4 x i32> @f13(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f13: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vo %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test uno. +define <4 x i32> @f14(<4 x i32> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f14: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28 +; CHECK: vno %v24, [[REG1]], [[REG2]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; Test oeq selects. +define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f15: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oeq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test one selects. +define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f16: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp one <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ogt selects. 
+define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f17: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ogt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test oge selects. +define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f18: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp oge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ole selects. +define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f19: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ole <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test olt selects. +define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f20: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp olt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ueq selects. 
+define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f21: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ueq <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test une selects. +define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f22: +; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp une <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ugt selects. +define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f23: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ugt <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uge selects. +define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f24: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uge <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ule selects. 
+define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f25: +; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ule <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ult selects. +define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f26: +; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26 +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ult <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test ord selects. +define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f27: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp ord <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test uno selects. +define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2, + <4 x float> %val3, <4 x float> %val4) { +; CHECK-LABEL: f28: +; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24 +; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26 +; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] +; CHECK-NEXT: br %r14 + %cmp = fcmp uno <4 x float> %val1, %val2 + %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %ret +} + +; Test an f32 comparison that uses vector registers. 
+define i64 @f29(i64 %a, i64 %b, float %f1, <4 x float> %vec) { +; CHECK-LABEL: f29: +; CHECK: wfcsb %f0, %v24 +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f2 = extractelement <4 x float> %vec, i32 0 + %cond = fcmp oeq float %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/vec-ctpop-02.ll b/test/CodeGen/SystemZ/vec-ctpop-02.ll new file mode 100644 index 000000000000..ee50e88d0430 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-ctpop-02.ll @@ -0,0 +1,45 @@ +; Test vector population-count instruction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) +declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + +define <16 x i8> @f1(<16 x i8> %a) { +; CHECK-LABEL: f1: +; CHECK: vpopctb %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) + ret <16 x i8> %popcnt +} + +define <8 x i16> @f2(<8 x i16> %a) { +; CHECK-LABEL: f2: +; CHECK: vpopcth %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) + ret <8 x i16> %popcnt +} + +define <4 x i32> @f3(<4 x i32> %a) { +; CHECK-LABEL: f3: +; CHECK: vpopctf %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) + ret <4 x i32> %popcnt +} + +define <2 x i64> @f4(<2 x i64> %a) { +; CHECK-LABEL: f4: +; CHECK: vpopctg %v24, %v24 +; CHECK: br %r14 + + %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + ret <2 x i64> %popcnt +} + diff --git a/test/CodeGen/SystemZ/vec-div-02.ll b/test/CodeGen/SystemZ/vec-div-02.ll new file mode 100644 index 000000000000..74e3b5148ad5 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-div-02.ll @@ -0,0 +1,24 @@ +; Test vector division on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 division. 
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfdsb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fdiv <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 division that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfdsb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fdiv float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/test/CodeGen/SystemZ/vec-intrinsics-01.ll new file mode 100644 index 000000000000..6f5eb0691aa8 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-intrinsics-01.ll @@ -0,0 +1,3335 @@ +; Test vector intrinsics. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i32 @llvm.s390.lcbb(i8 *, i32) +declare <16 x i8> @llvm.s390.vlbb(i8 *, i32) +declare <16 x i8> @llvm.s390.vll(i32, i8 *) +declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32) +declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>) +declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>) +declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>) +declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>) +declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>) +declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *) +declare <8 x i16> 
@llvm.s390.vuphb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuphh(<8 x i16>) +declare <2 x i64> @llvm.s390.vuphf(<4 x i32>) +declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>) +declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>) +declare <8 x i16> @llvm.s390.vuplb(<16 x i8>) +declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>) +declare <2 x i64> @llvm.s390.vuplf(<4 x i32>) +declare <8 x i16> @llvm.s390.vupllb(<16 x i8>) +declare <4 x i32> @llvm.s390.vupllh(<8 x i16>) +declare <2 x i64> @llvm.s390.vupllf(<4 x i32>) +declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vavgf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vavgg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vavglb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vavglh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vavglf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vavglg(<2 x i64>, <2 x i64>) +declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) +declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x 
i64>) +declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) +declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vmalhb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmalhh(<8 x i16>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmalhf(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmaeb(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaeh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmaef(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmaleb(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaleh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmalef(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmaob(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaoh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmaof(<4 x i32>, <4 x i32>, <2 x i64>) +declare <8 x i16> @llvm.s390.vmalob(<16 x i8>, <16 x i8>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmaloh(<8 x i16>, <8 x i16>, <4 x i32>) +declare <2 x i64> @llvm.s390.vmalof(<4 x i32>, <4 x i32>, <2 x i64>) +declare <16 x i8> @llvm.s390.vmhb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmhh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmhf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vmlhb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vmlhh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vmlhf(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmeb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmeh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmef(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmleb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmleh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmlef(<4 x 
i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmob(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmoh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vmlof(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) +declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32) +declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) +declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) +declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare <2 x i64> @llvm.s390.verimg(<2 x i64>, <2 x i64>, <2 x i64>, i32) +declare <16 x i8> @llvm.s390.vsl(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vslb(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsra(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrab(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrl(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsrlb(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, 
<16 x i8>) +declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) +declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) +declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) +declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64>, <2 x i64>) +declare {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32>, <4 x i32>) +declare {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.s390.vfaeb(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vfaeh(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vfaef(<4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8>, <16 x i8>, i32) +declare {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16>, <8 x i16>, i32) +declare {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.s390.vfaezb(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vfaezh(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vfaezf(<4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8>, <16 x i8>, i32) +declare {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16>, <8 x i16>, i32) +declare {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32>, <4 x 
i32>, i32) +declare <16 x i8> @llvm.s390.vfeeb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeeh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfeef(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfeezb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeezh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfeezf(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfeneb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfeneh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfenef(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vfenezb(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.s390.vfenezh(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.s390.vfenezf(<4 x i32>, <4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8>, <16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16>, <8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32>, <4 x i32>) +declare <16 x i8> @llvm.s390.vistrb(<16 x i8>) +declare <8 x i16> @llvm.s390.vistrh(<8 x i16>) +declare <4 x i32> @llvm.s390.vistrf(<4 x i32>) +declare {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8>) +declare {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16>) +declare {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32>) +declare <16 x i8> @llvm.s390.vstrcb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vstrch(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x 
i32> @llvm.s390.vstrcf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8>, <16 x i8>, <16 x i8>, + i32) +declare {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16>, <8 x i16>, <8 x i16>, + i32) +declare {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32>, <4 x i32>, <4 x i32>, + i32) +declare <16 x i8> @llvm.s390.vstrczb(<16 x i8>, <16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.s390.vstrczh(<8 x i16>, <8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.s390.vstrczf(<4 x i32>, <4 x i32>, <4 x i32>, i32) +declare {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8>, <16 x i8>, <16 x i8>, + i32) +declare {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16>, <8 x i16>, <8 x i16>, + i32) +declare {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32>, <4 x i32>, <4 x i32>, + i32) +declare {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double>, <2 x double>) +declare {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double>, i32) +declare <2 x double> @llvm.s390.vfidb(<2 x double>, i32, i32) + +; LCBB with the lowest M3 operand. +define i32 @test_lcbb1(i8 *%ptr) { +; CHECK-LABEL: test_lcbb1: +; CHECK: lcbb %r2, 0(%r2), 0 +; CHECK: br %r14 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 0) + ret i32 %res +} + +; LCBB with the highest M3 operand. +define i32 @test_lcbb2(i8 *%ptr) { +; CHECK-LABEL: test_lcbb2: +; CHECK: lcbb %r2, 0(%r2), 15 +; CHECK: br %r14 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 15) + ret i32 %res +} + +; LCBB with a displacement and index. +define i32 @test_lcbb3(i8 *%base, i64 %index) { +; CHECK-LABEL: test_lcbb3: +; CHECK: lcbb %r2, 4095({{%r2,%r3|%r3,%r2}}), 4 +; CHECK: br %r14 + %add = add i64 %index, 4095 + %ptr = getelementptr i8, i8 *%base, i64 %add + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 4) + ret i32 %res +} + +; LCBB with an out-of-range displacement. 
+define i32 @test_lcbb4(i8 *%base) { +; CHECK-LABEL: test_lcbb4: +; CHECK: lcbb %r2, 0({{%r[1-5]}}), 5 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 5) + ret i32 %res +} + +; VLBB with the lowest M3 operand. +define <16 x i8> @test_vlbb1(i8 *%ptr) { +; CHECK-LABEL: test_vlbb1: +; CHECK: vlbb %v24, 0(%r2), 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 0) + ret <16 x i8> %res +} + +; VLBB with the highest M3 operand. +define <16 x i8> @test_vlbb2(i8 *%ptr) { +; CHECK-LABEL: test_vlbb2: +; CHECK: vlbb %v24, 0(%r2), 15 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 15) + ret <16 x i8> %res +} + +; VLBB with a displacement and index. +define <16 x i8> @test_vlbb3(i8 *%base, i64 %index) { +; CHECK-LABEL: test_vlbb3: +; CHECK: vlbb %v24, 4095({{%r2,%r3|%r3,%r2}}), 4 +; CHECK: br %r14 + %add = add i64 %index, 4095 + %ptr = getelementptr i8, i8 *%base, i64 %add + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 4) + ret <16 x i8> %res +} + +; VLBB with an out-of-range displacement. +define <16 x i8> @test_vlbb4(i8 *%base) { +; CHECK-LABEL: test_vlbb4: +; CHECK: vlbb %v24, 0({{%r[1-5]}}), 5 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 5) + ret <16 x i8> %res +} + +; VLL with the lowest in-range displacement. +define <16 x i8> @test_vll1(i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vll1: +; CHECK: vll %v24, %r3, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLL with the highest in-range displacement. +define <16 x i8> @test_vll2(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vll2: +; CHECK: vll %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLL with an out-of-range displacementa. 
+define <16 x i8> @test_vll3(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vll3: +; CHECK: vll %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLL doesn't allow an index. +define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vll4: +; CHECK: vll %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VPDI taking element 0 from each half. +define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpdi1: +; CHECK: vpdi %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 0) + ret <2 x i64> %res +} + +; VPDI taking element 1 from each half. +define <2 x i64> @test_vpdi2(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpdi2: +; CHECK: vpdi %v24, %v24, %v26, 10 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 10) + ret <2 x i64> %res +} + +; VPERM. +define <16 x i8> @test_vperm(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vperm: +; CHECK: vperm %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vperm(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VPKSH. +define <16 x i8> @test_vpksh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vpksh: +; CHECK: vpksh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vpksh(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %res +} + +; VPKSF. +define <8 x i16> @test_vpksf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vpksf: +; CHECK: vpksf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vpksf(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %res +} + +; VPKSG. 
+define <4 x i32> @test_vpksg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpksg: +; CHECK: vpksg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vpksg(<2 x i64> %a, <2 x i64> %b) + ret <4 x i32> %res +} + +; VPKSHS with no processing of the result. +define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpkshs: +; CHECK: vpkshs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VPKSHS, storing to %ptr if all values were saturated. +define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) { +; CHECK-LABEL: test_vpkshs_all_store: +; CHECK: vpkshs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VPKSFS with no processing of the result. +define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpksfs: +; CHECK: vpksfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VPKSFS, storing to %ptr if any values were saturated. 
+define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { +; CHECK-LABEL: test_vpksfs_any_store: +; CHECK: vpksfs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VPKSGS with no processing of the result. +define <4 x i32> @test_vpksgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpksgs: +; CHECK: vpksgs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VPKSGS, storing to %ptr if no elements were saturated +define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpksgs_none_store: +; CHECK: vpksgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VPKLSH. +define <16 x i8> @test_vpklsh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vpklsh: +; CHECK: vpklsh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vpklsh(<8 x i16> %a, <8 x i16> %b) + ret <16 x i8> %res +} + +; VPKLSF. 
+define <8 x i16> @test_vpklsf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vpklsf: +; CHECK: vpklsf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> %a, <4 x i32> %b) + ret <8 x i16> %res +} + +; VPKLSG. +define <4 x i32> @test_vpklsg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpklsg: +; CHECK: vpklsg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> %a, <2 x i64> %b) + ret <4 x i32> %res +} + +; VPKLSHS with no processing of the result. +define <16 x i8> @test_vpklshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklshs: +; CHECK: vpklshs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VPKLSHS, storing to %ptr if all values were saturated. +define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklshs_all_store: +; CHECK: vpklshs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp eq i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VPKLSFS with no processing of the result. 
+define <8 x i16> @test_vpklsfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklsfs: +; CHECK: vpklsfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VPKLSFS, storing to %ptr if any values were saturated. +define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklsfs_any_store: +; CHECK: vpklsfs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ne i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VPKLSGS with no processing of the result. 
+define <4 x i32> @test_vpklsgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vpklsgs: +; CHECK: vpklsgs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VPKLSGS, storing to %ptr if no elements were saturated +define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vpklsgs_none_store: +; CHECK: vpklsgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VSTL with the lowest in-range displacement. +define void @test_vstl1(<16 x i8> %vec, i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vstl1: +; CHECK: vstl %v24, %r3, 0(%r2) +; CHECK: br %r14 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTL with the highest in-range displacement. +define void @test_vstl2(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstl2: +; CHECK: vstl %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTL with an out-of-range displacement. 
+define void @test_vstl3(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstl3: +; CHECK: vstl %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; Check that VSTL doesn't allow an index. +define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vstl4: +; CHECK: vstl %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VUPHB. +define <8 x i16> @test_vuphb(<16 x i8> %a) { +; CHECK-LABEL: test_vuphb: +; CHECK: vuphb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuphb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPHH. +define <4 x i32> @test_vuphh(<8 x i16> %a) { +; CHECK-LABEL: test_vuphh: +; CHECK: vuphh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuphh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPHF. +define <2 x i64> @test_vuphf(<4 x i32> %a) { +; CHECK-LABEL: test_vuphf: +; CHECK: vuphf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuphf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLHB. +define <8 x i16> @test_vuplhb(<16 x i8> %a) { +; CHECK-LABEL: test_vuplhb: +; CHECK: vuplhb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuplhb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLHH. +define <4 x i32> @test_vuplhh(<8 x i16> %a) { +; CHECK-LABEL: test_vuplhh: +; CHECK: vuplhh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuplhh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLHF. +define <2 x i64> @test_vuplhf(<4 x i32> %a) { +; CHECK-LABEL: test_vuplhf: +; CHECK: vuplhf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLB. 
+define <8 x i16> @test_vuplb(<16 x i8> %a) { +; CHECK-LABEL: test_vuplb: +; CHECK: vuplb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vuplb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLHW. +define <4 x i32> @test_vuplhw(<8 x i16> %a) { +; CHECK-LABEL: test_vuplhw: +; CHECK: vuplhw %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vuplhw(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLF. +define <2 x i64> @test_vuplf(<4 x i32> %a) { +; CHECK-LABEL: test_vuplf: +; CHECK: vuplf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vuplf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VUPLLB. +define <8 x i16> @test_vupllb(<16 x i8> %a) { +; CHECK-LABEL: test_vupllb: +; CHECK: vupllb %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vupllb(<16 x i8> %a) + ret <8 x i16> %res +} + +; VUPLLH. +define <4 x i32> @test_vupllh(<8 x i16> %a) { +; CHECK-LABEL: test_vupllh: +; CHECK: vupllh %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vupllh(<8 x i16> %a) + ret <4 x i32> %res +} + +; VUPLLF. +define <2 x i64> @test_vupllf(<4 x i32> %a) { +; CHECK-LABEL: test_vupllf: +; CHECK: vupllf %v24, %v24 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vupllf(<4 x i32> %a) + ret <2 x i64> %res +} + +; VACCB. +define <16 x i8> @test_vaccb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaccb: +; CHECK: vaccb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaccb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACCH. +define <8 x i16> @test_vacch(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vacch: +; CHECK: vacch %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vacch(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VACCF. 
+define <4 x i32> @test_vaccf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaccf: +; CHECK: vaccf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vaccf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VACCG. +define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vaccg: +; CHECK: vaccg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vaccg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VAQ. +define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaq: +; CHECK: vaq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACQ. +define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vacq: +; CHECK: vacq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VACCQ. +define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaccq: +; CHECK: vaccq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VACCCQ. +define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vacccq: +; CHECK: vacccq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VAVGB. +define <16 x i8> @test_vavgb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vavgb: +; CHECK: vavgb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vavgb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VAVGH. 
+define <8 x i16> @test_vavgh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vavgh: +; CHECK: vavgh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vavgh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VAVGF. +define <4 x i32> @test_vavgf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vavgf: +; CHECK: vavgf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vavgf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VAVGG. +define <2 x i64> @test_vavgg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vavgg: +; CHECK: vavgg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vavgg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VAVGLB. +define <16 x i8> @test_vavglb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vavglb: +; CHECK: vavglb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vavglb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VAVGLH. +define <8 x i16> @test_vavglh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vavglh: +; CHECK: vavglh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vavglh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VAVGLF. +define <4 x i32> @test_vavglf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vavglf: +; CHECK: vavglf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vavglf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VAVGLG. +define <2 x i64> @test_vavglg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vavglg: +; CHECK: vavglg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vavglg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VCKSM. +define <4 x i32> @test_vcksm(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vcksm: +; CHECK: vcksm %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vcksm(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VGFMB. 
+define <8 x i16> @test_vgfmb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vgfmb: +; CHECK: vgfmb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VGFMH. +define <4 x i32> @test_vgfmh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vgfmh: +; CHECK: vgfmh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VGFMF. +define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vgfmf: +; CHECK: vgfmf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VGFMG. +define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vgfmg: +; CHECK: vgfmg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) + ret <16 x i8> %res +} + +; VGFMAB. +define <8 x i16> @test_vgfmab(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vgfmab: +; CHECK: vgfmab %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VGFMAH. +define <4 x i32> @test_vgfmah(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vgfmah: +; CHECK: vgfmah %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VGFMAF. +define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vgfmaf: +; CHECK: vgfmaf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VGFMAG. 
+define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vgfmag: +; CHECK: vgfmag %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMAHB. +define <16 x i8> @test_vmahb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmahb: +; CHECK: vmahb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmahb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMAHH. +define <8 x i16> @test_vmahh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmahh: +; CHECK: vmahh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmahh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAHF. +define <4 x i32> @test_vmahf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmahf: +; CHECK: vmahf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmahf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALHB. +define <16 x i8> @test_vmalhb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmalhb: +; CHECK: vmalhb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmalhb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VMALHH. +define <8 x i16> @test_vmalhh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmalhh: +; CHECK: vmalhh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmalhh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALHF. +define <4 x i32> @test_vmalhf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmalhf: +; CHECK: vmalhf %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmalhf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAEB. 
+define <8 x i16> @test_vmaeb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaeb: +; CHECK: vmaeb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaeb(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAEH. +define <4 x i32> @test_vmaeh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaeh: +; CHECK: vmaeh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaeh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAEF. +define <2 x i64> @test_vmaef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmaef: +; CHECK: vmaef %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmaef(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMALEB. +define <8 x i16> @test_vmaleb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaleb: +; CHECK: vmaleb %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaleb(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALEH. +define <4 x i32> @test_vmaleh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaleh: +; CHECK: vmaleh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaleh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALEF. +define <2 x i64> @test_vmalef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmalef: +; CHECK: vmalef %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmalef(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMAOB. +define <8 x i16> @test_vmaob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmaob: +; CHECK: vmaob %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmaob(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMAOH. 
+define <4 x i32> @test_vmaoh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaoh: +; CHECK: vmaoh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaoh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMAOF. +define <2 x i64> @test_vmaof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmaof: +; CHECK: vmaof %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmaof(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMALOB. +define <8 x i16> @test_vmalob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vmalob: +; CHECK: vmalob %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmalob(<16 x i8> %a, <16 x i8> %b, + <8 x i16> %c) + ret <8 x i16> %res +} + +; VMALOH. +define <4 x i32> @test_vmaloh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vmaloh: +; CHECK: vmaloh %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmaloh(<8 x i16> %a, <8 x i16> %b, + <4 x i32> %c) + ret <4 x i32> %res +} + +; VMALOF. +define <2 x i64> @test_vmalof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { +; CHECK-LABEL: test_vmalof: +; CHECK: vmalof %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmalof(<4 x i32> %a, <4 x i32> %b, + <2 x i64> %c) + ret <2 x i64> %res +} + +; VMHB. +define <16 x i8> @test_vmhb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmhb: +; CHECK: vmhb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmhb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VMHH. +define <8 x i16> @test_vmhh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmhh: +; CHECK: vmhh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmhh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VMHF. 
+define <4 x i32> @test_vmhf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmhf: +; CHECK: vmhf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmhf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VMLHB. +define <16 x i8> @test_vmlhb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlhb: +; CHECK: vmlhb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmlhb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VMLHH. +define <8 x i16> @test_vmlhh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmlhh: +; CHECK: vmlhh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmlhh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VMLHF. +define <4 x i32> @test_vmlhf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlhf: +; CHECK: vmlhf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmlhf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VMEB. +define <8 x i16> @test_vmeb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmeb: +; CHECK: vmeb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmeb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMEH. +define <4 x i32> @test_vmeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmeh: +; CHECK: vmeh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmeh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMEF. +define <2 x i64> @test_vmef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmef: +; CHECK: vmef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmef(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMLEB. +define <8 x i16> @test_vmleb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmleb: +; CHECK: vmleb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmleb(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMLEH. 
+define <4 x i32> @test_vmleh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmleh: +; CHECK: vmleh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmleh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMLEF. +define <2 x i64> @test_vmlef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlef: +; CHECK: vmlef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmlef(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMOB. +define <8 x i16> @test_vmob(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmob: +; CHECK: vmob %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmob(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMOH. +define <4 x i32> @test_vmoh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmoh: +; CHECK: vmoh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmoh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMOF. +define <2 x i64> @test_vmof(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmof: +; CHECK: vmof %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmof(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VMLOB. +define <8 x i16> @test_vmlob(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmlob: +; CHECK: vmlob %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vmlob(<16 x i8> %a, <16 x i8> %b) + ret <8 x i16> %res +} + +; VMLOH. +define <4 x i32> @test_vmloh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmloh: +; CHECK: vmloh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vmloh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VMLOF. +define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmlof: +; CHECK: vmlof %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vmlof(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VERLLVB. 
+define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_verllvb: +; CHECK: verllvb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VERLLVH. +define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_verllvh: +; CHECK: verllvh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VERLLVF. +define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_verllvf: +; CHECK: verllvf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VERLLVG. +define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_verllvg: +; CHECK: verllvg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VERLLB. +define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { +; CHECK-LABEL: test_verllb: +; CHECK: verllb %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) + ret <16 x i8> %res +} + +; VERLLH. +define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { +; CHECK-LABEL: test_verllh: +; CHECK: verllh %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) + ret <8 x i16> %res +} + +; VERLLF. +define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: test_verllf: +; CHECK: verllf %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) + ret <4 x i32> %res +} + +; VERLLG. +define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { +; CHECK-LABEL: test_verllg: +; CHECK: verllg %v24, %v24, 0(%r2) +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) + ret <2 x i64> %res +} + +; VERLLB with the smallest count. 
+define <16 x i8> @test_verllb_1(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_1: +; CHECK: verllb %v24, %v24, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) + ret <16 x i8> %res +} + +; VERLLB with the largest count. +define <16 x i8> @test_verllb_4095(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_4095: +; CHECK: verllb %v24, %v24, 4095 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) + ret <16 x i8> %res +} + +; VERLLB with the largest count + 1. +define <16 x i8> @test_verllb_4096(<16 x i8> %a) { +; CHECK-LABEL: test_verllb_4096: +; CHECK: lhi [[REG:%r[1-5]]], 4096 +; CHECK: verllb %v24, %v24, 0([[REG]]) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) + ret <16 x i8> %res +} + +; VERIMB. +define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_verimb: +; CHECK: verimb %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VERIMH. +define <8 x i16> @test_verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_verimh: +; CHECK: verimh %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, i32 1) + ret <8 x i16> %res +} + +; VERIMF. +define <4 x i32> @test_verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_verimf: +; CHECK: verimf %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1) + ret <4 x i32> %res +} + +; VERIMG. +define <2 x i64> @test_verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { +; CHECK-LABEL: test_verimg: +; CHECK: verimg %v24, %v26, %v28, 1 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, i32 1) + ret <2 x i64> %res +} + +; VERIMB with a different mask. 
+define <16 x i8> @test_verimb_254(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_verimb_254: +; CHECK: verimb %v24, %v26, %v28, 254 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 254) + ret <16 x i8> %res +} + +; VSL. +define <16 x i8> @test_vsl(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsl: +; CHECK: vsl %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsl(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSLB. +define <16 x i8> @test_vslb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vslb: +; CHECK: vslb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vslb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRA. +define <16 x i8> @test_vsra(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsra: +; CHECK: vsra %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsra(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRAB. +define <16 x i8> @test_vsrab(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrab: +; CHECK: vsrab %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrab(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRL. +define <16 x i8> @test_vsrl(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrl: +; CHECK: vsrl %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrl(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSRLB. +define <16 x i8> @test_vsrlb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsrlb: +; CHECK: vsrlb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSLDB with the minimum useful value. 
+define <16 x i8> @test_vsldb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsldb_1: +; CHECK: vsldb %v24, %v24, %v26, 1 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VSLDB with the maximum value. +define <16 x i8> @test_vsldb_15(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsldb_15: +; CHECK: vsldb %v24, %v24, %v26, 15 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 15) + ret <16 x i8> %res +} + +; VSCBIB. +define <16 x i8> @test_vscbib(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vscbib: +; CHECK: vscbib %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vscbib(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSCBIH. +define <8 x i16> @test_vscbih(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vscbih: +; CHECK: vscbih %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vscbih(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VSCBIF. +define <4 x i32> @test_vscbif(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vscbif: +; CHECK: vscbif %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vscbif(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VSCBIG. +define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vscbig: +; CHECK: vscbig %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vscbig(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %res +} + +; VSQ. +define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsq: +; CHECK: vsq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSBIQ. 
+define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vsbiq: +; CHECK: vsbiq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VSCBIQ. +define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vscbiq: +; CHECK: vscbiq %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VSBCBIQ. +define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vsbcbiq: +; CHECK: vsbcbiq %v24, %v24, %v26, %v28 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c) + ret <16 x i8> %res +} + +; VSUMB. +define <4 x i32> @test_vsumb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vsumb: +; CHECK: vsumb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vsumb(<16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %res +} + +; VSUMH. +define <4 x i32> @test_vsumh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vsumh: +; CHECK: vsumh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vsumh(<8 x i16> %a, <8 x i16> %b) + ret <4 x i32> %res +} + +; VSUMGH. +define <2 x i64> @test_vsumgh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vsumgh: +; CHECK: vsumgh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %a, <8 x i16> %b) + ret <2 x i64> %res +} + +; VSUMGF. +define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vsumgf: +; CHECK: vsumgf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %a, <4 x i32> %b) + ret <2 x i64> %res +} + +; VSUMQF. 
+define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vsumqf: +; CHECK: vsumqf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) + ret <16 x i8> %res +} + +; VSUMQG. +define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vsumqg: +; CHECK: vsumqg %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) + ret <16 x i8> %res +} + +; VTM with no processing of the result. +define i32 @test_vtm(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vtm: +; CHECK: vtm %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) + ret i32 %res +} + +; VTM, storing to %ptr if all bits are set. +define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vtm_all_store: +; CHECK-NOT: %r +; CHECK: vtm %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) + %cmp = icmp sge i32 %res, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret void +} + +; VCEQBS with no processing of the result. +define i32 @test_vceqbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vceqbs: +; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCEQBS, returning 1 if any elements are equal (CC != 3). 
+define i32 @test_vceqbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vceqbs_any_bool: +; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQBS, storing to %ptr if any elements are equal. +define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vceqbs_any_store: +; CHECK-NOT: %r +; CHECK: vceqbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCEQHS with no processing of the result. +define i32 @test_vceqhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vceqhs: +; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCEQHS, returning 1 if not all elements are equal. +define i32 @test_vceqhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vceqhs_notall_bool: +; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQHS, storing to %ptr if not all elements are equal. 
+define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vceqhs_notall_store: +; CHECK-NOT: %r +; CHECK: vceqhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCEQFS with no processing of the result. +define i32 @test_vceqfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vceqfs: +; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCEQFS, returning 1 if no elements are equal. +define i32 @test_vceqfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vceqfs_none_bool: +; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQFS, storing to %ptr if no elements are equal. 
+define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vceqfs_none_store: +; CHECK-NOT: %r +; CHECK: vceqfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCEQGS with no processing of the result. +define i32 @test_vceqgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vceqgs: +; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCEQGS returning 1 if all elements are equal (CC == 0). +define i32 @test_vceqgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vceqgs_all_bool: +; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ult i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCEQGS, storing to %ptr if all elements are equal. 
+define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vceqgs_all_store: +; CHECK-NOT: %r +; CHECK: vceqgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VCHBS with no processing of the result. +define i32 @test_vchbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchbs: +; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCHBS, returning 1 if any elements are higher (CC != 3). +define i32 @test_vchbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchbs_any_bool: +; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHBS, storing to %ptr if any elements are higher. 
+define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchbs_any_store: +; CHECK-NOT: %r +; CHECK: vchbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCHHS with no processing of the result. +define i32 @test_vchhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchhs: +; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCHHS, returning 1 if not all elements are higher. +define i32 @test_vchhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchhs_notall_bool: +; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHHS, storing to %ptr if not all elements are higher. 
+define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchhs_notall_store: +; CHECK-NOT: %r +; CHECK: vchhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCHFS with no processing of the result. +define i32 @test_vchfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchfs: +; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCHFS, returning 1 if no elements are higher. +define i32 @test_vchfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchfs_none_bool: +; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHFS, storing to %ptr if no elements are higher. 
+define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchfs_none_store: +; CHECK-NOT: %r +; CHECK: vchfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCHGS with no processing of the result. +define i32 @test_vchgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchgs: +; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCHGS returning 1 if all elements are higher (CC == 0). +define i32 @test_vchgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchgs_all_bool: +; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ult i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHGS, storing to %ptr if all elements are higher. 
+define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchgs_all_store: +; CHECK-NOT: %r +; CHECK: vchgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VCHLBS with no processing of the result. +define i32 @test_vchlbs(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchlbs: +; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + ret i32 %res +} + +; VCHLBS, returning 1 if any elements are higher (CC != 3). +define i32 @test_vchlbs_any_bool(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vchlbs_any_bool: +; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLBS, storing to %ptr if any elements are higher. 
+define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchlbs_any_store: +; CHECK-NOT: %r +; CHECK: vchlbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + %cmp = icmp sle i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <16 x i8> %res +} + +; VCHLHS with no processing of the result. +define i32 @test_vchlhs(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchlhs: +; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + ret i32 %res +} + +; VCHLHS, returning 1 if not all elements are higher. +define i32 @test_vchlhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vchlhs_notall_bool: +; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp uge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLHS, storing to %ptr if not all elements are higher. 
+define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchlhs_notall_store: +; CHECK-NOT: %r +; CHECK: vchlhs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + %cmp = icmp sgt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <8 x i16> %res +} + +; VCHLFS with no processing of the result. +define i32 @test_vchlfs(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchlfs: +; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VCHLFS, returning 1 if no elements are higher. +define i32 @test_vchlfs_none_bool(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vchlfs_none_bool: +; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLFS, storing to %ptr if no elements are higher. 
+define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vchlfs_none_store: +; CHECK-NOT: %r +; CHECK: vchlfs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VCHLGS with no processing of the result. +define i32 @test_vchlgs(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchlgs: +; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VCHLGS returning 1 if all elements are higher (CC == 0). +define i32 @test_vchlgs_all_bool(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vchlgs_all_bool: +; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp slt i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VCHLGS, storing to %ptr if all elements are higher. 
+define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { +; CHECK-LABEL: test_vchlgs_all_store: +; CHECK-NOT: %r +; CHECK: vchlgs %v24, %v24, %v26 +; CHECK-NEXT: {{bnher|bner}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFAEB with !IN !RT. +define <16 x i8> @test_vfaeb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_0: +; CHECK: vfaeb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 0) + ret <16 x i8> %res +} + +; VFAEB with !IN RT. +define <16 x i8> @test_vfaeb_4(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_4: +; CHECK: vfaeb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 4) + ret <16 x i8> %res +} + +; VFAEB with IN !RT. +define <16 x i8> @test_vfaeb_8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_8: +; CHECK: vfaeb %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 8) + ret <16 x i8> %res +} + +; VFAEB with IN RT. +define <16 x i8> @test_vfaeb_12(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_12: +; CHECK: vfaeb %v24, %v24, %v26, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 12) + ret <16 x i8> %res +} + +; VFAEB with CS -- should be ignored. +define <16 x i8> @test_vfaeb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaeb_1: +; CHECK: vfaeb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VFAEH. 
+define <8 x i16> @test_vfaeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfaeh: +; CHECK: vfaeh %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfaeh(<8 x i16> %a, <8 x i16> %b, i32 4) + ret <8 x i16> %res +} + +; VFAEF. +define <4 x i32> @test_vfaef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfaef: +; CHECK: vfaef %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfaef(<4 x i32> %a, <4 x i32> %b, i32 8) + ret <4 x i32> %res +} + +; VFAEBS. +define <16 x i8> @test_vfaebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaebs: +; CHECK: vfaebs %v24, %v24, %v26, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8> %a, <16 x i8> %b, + i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFAEHS. +define <8 x i16> @test_vfaehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaehs: +; CHECK: vfaehs %v24, %v24, %v26, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16> %a, <8 x i16> %b, + i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFAEFS. +define <4 x i32> @test_vfaefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaefs: +; CHECK: vfaefs %v24, %v24, %v26, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32> %a, <4 x i32> %b, + i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFAEZB with !IN !RT. 
+define <16 x i8> @test_vfaezb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_0: +; CHECK: vfaezb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 0) + ret <16 x i8> %res +} + +; VFAEZB with !IN RT. +define <16 x i8> @test_vfaezb_4(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_4: +; CHECK: vfaezb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 4) + ret <16 x i8> %res +} + +; VFAEZB with IN !RT. +define <16 x i8> @test_vfaezb_8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_8: +; CHECK: vfaezb %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 8) + ret <16 x i8> %res +} + +; VFAEZB with IN RT. +define <16 x i8> @test_vfaezb_12(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_12: +; CHECK: vfaezb %v24, %v24, %v26, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 12) + ret <16 x i8> %res +} + +; VFAEZB with CS -- should be ignored. +define <16 x i8> @test_vfaezb_1(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfaezb_1: +; CHECK: vfaezb %v24, %v24, %v26, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 1) + ret <16 x i8> %res +} + +; VFAEZH. +define <8 x i16> @test_vfaezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfaezh: +; CHECK: vfaezh %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfaezh(<8 x i16> %a, <8 x i16> %b, i32 4) + ret <8 x i16> %res +} + +; VFAEZF. +define <4 x i32> @test_vfaezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfaezf: +; CHECK: vfaezf %v24, %v24, %v26, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfaezf(<4 x i32> %a, <4 x i32> %b, i32 8) + ret <4 x i32> %res +} + +; VFAEZBS. 
+define <16 x i8> @test_vfaezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezbs: +; CHECK: vfaezbs %v24, %v24, %v26, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8> %a, <16 x i8> %b, + i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFAEZHS. +define <8 x i16> @test_vfaezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezhs: +; CHECK: vfaezhs %v24, %v24, %v26, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16> %a, <8 x i16> %b, + i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFAEZFS. +define <4 x i32> @test_vfaezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfaezfs: +; CHECK: vfaezfs %v24, %v24, %v26, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32> %a, <4 x i32> %b, + i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFEEB. +define <16 x i8> @test_vfeeb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeeb_0: +; CHECK: vfeeb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeeb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFEEH. +define <8 x i16> @test_vfeeh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeeh: +; CHECK: vfeeh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeeh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFEEF. 
+define <4 x i32> @test_vfeef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfeef: +; CHECK: vfeef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfeef(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFEEBS. +define <16 x i8> @test_vfeebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeebs: +; CHECK: vfeebs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFEEHS. +define <8 x i16> @test_vfeehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeehs: +; CHECK: vfeehs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFEEFS. +define <4 x i32> @test_vfeefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeefs: +; CHECK: vfeefs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFEEZB. +define <16 x i8> @test_vfeezb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeezb: +; CHECK: vfeezb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeezb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFEEZH. 
+define <8 x i16> @test_vfeezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeezh: +; CHECK: vfeezh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeezh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFEEZF. +define <4 x i32> @test_vfeezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfeezf: +; CHECK: vfeezf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfeezf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFEEZBS. +define <16 x i8> @test_vfeezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezbs: +; CHECK: vfeezbs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFEEZHS. +define <8 x i16> @test_vfeezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezhs: +; CHECK: vfeezhs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFEEZFS. +define <4 x i32> @test_vfeezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfeezfs: +; CHECK: vfeezfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFENEB. 
+define <16 x i8> @test_vfeneb_0(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfeneb_0: +; CHECK: vfeneb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfeneb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFENEH. +define <8 x i16> @test_vfeneh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfeneh: +; CHECK: vfeneh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfeneh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFENEF. +define <4 x i32> @test_vfenef(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfenef: +; CHECK: vfenef %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfenef(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFENEBS. +define <16 x i8> @test_vfenebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenebs: +; CHECK: vfenebs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFENEHS. +define <8 x i16> @test_vfenehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenehs: +; CHECK: vfenehs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFENEFS. 
+define <4 x i32> @test_vfenefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenefs: +; CHECK: vfenefs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFENEZB. +define <16 x i8> @test_vfenezb(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vfenezb: +; CHECK: vfenezb %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vfenezb(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %res +} + +; VFENEZH. +define <8 x i16> @test_vfenezh(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vfenezh: +; CHECK: vfenezh %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vfenezh(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +; VFENEZF. +define <4 x i32> @test_vfenezf(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vfenezf: +; CHECK: vfenezf %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vfenezf(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %res +} + +; VFENEZBS. +define <16 x i8> @test_vfenezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezbs: +; CHECK: vfenezbs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8> %a, <16 x i8> %b) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VFENEZHS. 
+define <8 x i16> @test_vfenezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezhs: +; CHECK: vfenezhs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16> %a, <8 x i16> %b) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VFENEZFS. +define <4 x i32> @test_vfenezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { +; CHECK-LABEL: test_vfenezfs: +; CHECK: vfenezfs %v24, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32> %a, <4 x i32> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VISTRB. +define <16 x i8> @test_vistrb(<16 x i8> %a) { +; CHECK-LABEL: test_vistrb: +; CHECK: vistrb %v24, %v24 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vistrb(<16 x i8> %a) + ret <16 x i8> %res +} + +; VISTRH. +define <8 x i16> @test_vistrh(<8 x i16> %a) { +; CHECK-LABEL: test_vistrh: +; CHECK: vistrh %v24, %v24 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vistrh(<8 x i16> %a) + ret <8 x i16> %res +} + +; VISTRF. +define <4 x i32> @test_vistrf(<4 x i32> %a) { +; CHECK-LABEL: test_vistrf: +; CHECK: vistrf %v24, %v24 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vistrf(<4 x i32> %a) + ret <4 x i32> %res +} + +; VISTRBS. 
+define <16 x i8> @test_vistrbs(<16 x i8> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrbs: +; CHECK: vistrbs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8> %a) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VISTRHS. +define <8 x i16> @test_vistrhs(<8 x i16> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrhs: +; CHECK: vistrhs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16> %a) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VISTRFS. +define <4 x i32> @test_vistrfs(<4 x i32> %a, i32 *%ccptr) { +; CHECK-LABEL: test_vistrfs: +; CHECK: vistrfs %v24, %v24 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32> %a) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VSTRCB with !IN !RT. +define <16 x i8> @test_vstrcb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_0: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VSTRCB with !IN RT. +define <16 x i8> @test_vstrcb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_4: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 4) + ret <16 x i8> %res +} + +; VSTRCB with IN !RT. 
+define <16 x i8> @test_vstrcb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_8: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 8) + ret <16 x i8> %res +} + +; VSTRCB with IN RT. +define <16 x i8> @test_vstrcb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_12: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VSTRCB with CS -- should be ignored. +define <16 x i8> @test_vstrcb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrcb_1: +; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VSTRCH. +define <8 x i16> @test_vstrch(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vstrch: +; CHECK: vstrch %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vstrch(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + ret <8 x i16> %res +} + +; VSTRCF. +define <4 x i32> @test_vstrcf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vstrcf: +; CHECK: vstrcf %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vstrcf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + ret <4 x i32> %res +} + +; VSTRCBS. 
+define <16 x i8> @test_vstrcbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrcbs: +; CHECK: vstrcbs %v24, %v24, %v26, %v28, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VSTRCHS. +define <8 x i16> @test_vstrchs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrchs: +; CHECK: vstrchs %v24, %v24, %v26, %v28, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VSTRCFS. +define <4 x i32> @test_vstrcfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrcfs: +; CHECK: vstrcfs %v24, %v24, %v26, %v28, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VSTRCZB with !IN !RT. +define <16 x i8> @test_vstrczb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_0: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VSTRCZB with !IN RT. 
+define <16 x i8> @test_vstrczb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_4: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 4) + ret <16 x i8> %res +} + +; VSTRCZB with IN !RT. +define <16 x i8> @test_vstrczb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_8: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 8) + ret <16 x i8> %res +} + +; VSTRCZB with IN RT. +define <16 x i8> @test_vstrczb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_12: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VSTRCZB with CS -- should be ignored. +define <16 x i8> @test_vstrczb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vstrczb_1: +; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 1) + ret <16 x i8> %res +} + +; VSTRCZH. +define <8 x i16> @test_vstrczh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK-LABEL: test_vstrczh: +; CHECK: vstrczh %v24, %v24, %v26, %v28, 4 +; CHECK: br %r14 + %res = call <8 x i16> @llvm.s390.vstrczh(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + ret <8 x i16> %res +} + +; VSTRCZF. +define <4 x i32> @test_vstrczf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: test_vstrczf: +; CHECK: vstrczf %v24, %v24, %v26, %v28, 8 +; CHECK: br %r14 + %res = call <4 x i32> @llvm.s390.vstrczf(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + ret <4 x i32> %res +} + +; VSTRCZBS. 
+define <16 x i8> @test_vstrczbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczbs: +; CHECK: vstrczbs %v24, %v24, %v26, %v28, 0 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8> %a, <16 x i8> %b, + <16 x i8> %c, i32 0) + %res = extractvalue {<16 x i8>, i32} %call, 0 + %cc = extractvalue {<16 x i8>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <16 x i8> %res +} + +; VSTRCZHS. +define <8 x i16> @test_vstrczhs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczhs: +; CHECK: vstrczhs %v24, %v24, %v26, %v28, 4 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16> %a, <8 x i16> %b, + <8 x i16> %c, i32 4) + %res = extractvalue {<8 x i16>, i32} %call, 0 + %cc = extractvalue {<8 x i16>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <8 x i16> %res +} + +; VSTRCZFS. +define <4 x i32> @test_vstrczfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, + i32 *%ccptr) { +; CHECK-LABEL: test_vstrczfs: +; CHECK: vstrczfs %v24, %v24, %v26, %v28, 8 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: srl [[REG]], 28 +; CHECK: st [[REG]], 0(%r2) +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32> %a, <4 x i32> %b, + <4 x i32> %c, i32 8) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + store i32 %cc, i32 *%ccptr + ret <4 x i32> %res +} + +; VFCEDBS with no processing of the result. 
+define i32 @test_vfcedbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfcedbs: +; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCEDBS, returning 1 if any elements are equal (CC != 3). +define i32 @test_vfcedbs_any_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfcedbs_any_bool: +; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCEDBS, storing to %ptr if any elements are equal. +define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfcedbs_any_store: +; CHECK-NOT: %r +; CHECK: vfcedbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFCHDBS with no processing of the result. +define i32 @test_vfchdbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchdbs: +; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCHDBS, returning 1 if not all elements are higher. 
+define i32 @test_vfchdbs_notall_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchdbs_notall_bool: +; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHDBS, storing to %ptr if not all elements are higher. +define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchdbs_notall_store: +; CHECK-NOT: %r +; CHECK: vfchdbs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFCHEDBS with no processing of the result. +define i32 @test_vfchedbs(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchedbs: +; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFCHEDBS, returning 1 if neither element is higher or equal. 
+define i32 @test_vfchedbs_none_bool(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfchedbs_none_bool: +; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHEDBS, storing to %ptr if neither element is higher or equal. +define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchedbs_none_store: +; CHECK-NOT: %r +; CHECK: vfchedbs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, + <2 x double> %b) + %res = extractvalue {<2 x i64>, i32} %call, 0 + %cc = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <2 x i64> %res +} + +; VFTCIDB with the lowest useful class selector and no processing of the result. +define i32 @test_vftcidb(<2 x double> %a) { +; CHECK-LABEL: test_vftcidb: +; CHECK: vftcidb {{%v[0-9]+}}, %v24, 1 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 1) + %res = extractvalue {<2 x i64>, i32} %call, 1 + ret i32 %res +} + +; VFTCIDB with the highest useful class selector, returning 1 if all elements +; have the right class (CC == 0). 
+define i32 @test_vftcidb_all_bool(<2 x double> %a) { +; CHECK-LABEL: test_vftcidb_all_bool: +; CHECK: vftcidb {{%v[0-9]+}}, %v24, 4094 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 4094) + %res = extractvalue {<2 x i64>, i32} %call, 1 + %cmp = icmp eq i32 %res, 0 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFIDB with a rounding mode not usable via standard intrinsics. +define <2 x double> @test_vfidb_0_4(<2 x double> %a) { +; CHECK-LABEL: test_vfidb_0_4: +; CHECK: vfidb %v24, %v24, 0, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 0, i32 4) + ret <2 x double> %res +} + +; VFIDB with IEEE-inexact exception suppressed. +define <2 x double> @test_vfidb_4_0(<2 x double> %a) { +; CHECK-LABEL: test_vfidb_4_0: +; CHECK: vfidb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 4, i32 0) + ret <2 x double> %res +} + diff --git a/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/test/CodeGen/SystemZ/vec-intrinsics-02.ll new file mode 100644 index 000000000000..84c6a0784031 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-intrinsics-02.ll @@ -0,0 +1,441 @@ +; Test vector intrinsics added with z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <2 x i64> @llvm.s390.vbperm(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.s390.vmslg(<2 x i64>, <2 x i64>, <16 x i8>, i32) +declare <16 x i8> @llvm.s390.vlrl(i32, i8 *) +declare void @llvm.s390.vstrl(<16 x i8>, i32, i8 *) + +declare {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float>, <4 x float>) +declare {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float>, i32) +declare <4 x float> @llvm.s390.vfisb(<4 x float>, i32, i32) + +declare <2 x double> @llvm.s390.vfmaxdb(<2 x double>, <2 x double>, i32) +declare <2 x double> @llvm.s390.vfmindb(<2 x double>, <2 x double>, i32) +declare <4 x float> @llvm.s390.vfmaxsb(<4 x float>, <4 x float>, i32) +declare <4 x float> @llvm.s390.vfminsb(<4 x float>, <4 x float>, i32) + +; VBPERM. +define <2 x i64> @test_vbperm(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vbperm: +; CHECK: vbperm %v24, %v24, %v26 +; CHECK: br %r14 + %res = call <2 x i64> @llvm.s390.vbperm(<16 x i8> %a, <16 x i8> %b) + ret <2 x i64> %res +} + +; VMSLG with no shifts. +define <16 x i8> @test_vmslg1(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmslg1: +; CHECK: vmslg %v24, %v24, %v26, %v28, 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 0) + ret <16 x i8> %res +} + +; VMSLG with both shifts. +define <16 x i8> @test_vmslg2(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +; CHECK-LABEL: test_vmslg2: +; CHECK: vmslg %v24, %v24, %v26, %v28, 12 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 12) + ret <16 x i8> %res +} + +; VLRLR with the lowest in-range displacement. 
+define <16 x i8> @test_vlrlr1(i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vlrlr1: +; CHECK: vlrlr %v24, %r3, 0(%r2) +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRLR with the highest in-range displacement. +define <16 x i8> @test_vlrlr2(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vlrlr2: +; CHECK: vlrlr %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRLR with an out-of-range displacement. +define <16 x i8> @test_vlrlr3(i8 *%base, i32 %length) { +; CHECK-LABEL: test_vlrlr3: +; CHECK: vlrlr %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLRLR doesn't allow an index. +define <16 x i8> @test_vlrlr4(i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vlrlr4: +; CHECK: vlrlr %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with the lowest in-range displacement. +define <16 x i8> @test_vlrl1(i8 *%ptr) { +; CHECK-LABEL: test_vlrl1: +; CHECK: vlrl %v24, 0(%r2), 0 +; CHECK: br %r14 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with the highest in-range displacement. +define <16 x i8> @test_vlrl2(i8 *%base) { +; CHECK-LABEL: test_vlrl2: +; CHECK: vlrl %v24, 4095(%r2), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VLRL with an out-of-range displacement. 
+define <16 x i8> @test_vlrl3(i8 *%base) { +; CHECK-LABEL: test_vlrl3: +; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; Check that VLRL doesn't allow an index. +define <16 x i8> @test_vlrl4(i8 *%base, i64 %index) { +; CHECK-LABEL: test_vlrl4: +; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr) + ret <16 x i8> %res +} + +; VSTRLR with the lowest in-range displacement. +define void @test_vstrlr1(<16 x i8> %vec, i8 *%ptr, i32 %length) { +; CHECK-LABEL: test_vstrlr1: +; CHECK: vstrlr %v24, %r3, 0(%r2) +; CHECK: br %r14 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRLR with the highest in-range displacement. +define void @test_vstrlr2(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstrlr2: +; CHECK: vstrlr %v24, %r3, 4095(%r2) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRLR with an out-of-range displacement. +define void @test_vstrlr3(<16 x i8> %vec, i8 *%base, i32 %length) { +; CHECK-LABEL: test_vstrlr3: +; CHECK: vstrlr %v24, %r3, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; Check that VSTRLR doesn't allow an index. +define void @test_vstrlr4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { +; CHECK-LABEL: test_vstrlr4: +; CHECK: vstrlr %v24, %r4, 0({{%r[1-5]}}) +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr) + ret void +} + +; VSTRL with the lowest in-range displacement. 
+define void @test_vstrl1(<16 x i8> %vec, i8 *%ptr) { +; CHECK-LABEL: test_vstrl1: +; CHECK: vstrl %v24, 0(%r2), 8 +; CHECK: br %r14 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VSTRL with the highest in-range displacement. +define void @test_vstrl2(<16 x i8> %vec, i8 *%base) { +; CHECK-LABEL: test_vstrl2: +; CHECK: vstrl %v24, 4095(%r2), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4095 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VSTRL with an out-of-range displacement. +define void @test_vstrl3(<16 x i8> %vec, i8 *%base) { +; CHECK-LABEL: test_vstrl3: +; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 4096 + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; Check that VSTRL doesn't allow an index. +define void @test_vstrl4(<16 x i8> %vec, i8 *%base, i64 %index) { +; CHECK-LABEL: test_vstrl4: +; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8 +; CHECK: br %r14 + %ptr = getelementptr i8, i8 *%base, i64 %index + call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr) + ret void +} + +; VFCESBS with no processing of the result. +define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfcesbs: +; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCESBS, returning 1 if any elements are equal (CC != 3). 
+define i32 @test_vfcesbs_any_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfcesbs_any_bool: +; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: afi %r2, -536870912 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ne i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCESBS, storing to %ptr if any elements are equal. +define <4 x i32> @test_vfcesbs_any_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfcesbs_any_store: +; CHECK-NOT: %r +; CHECK: vfcesbs %v24, %v24, %v26 +; CHECK-NEXT: {{bor|bnler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ule i32 %cc, 2 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFCHSBS with no processing of the result. +define i32 @test_vfchsbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchsbs: +; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCHSBS, returning 1 if not all elements are higher. 
+define i32 @test_vfchsbs_notall_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchsbs_notall_bool: +; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 36 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp sge i32 %res, 1 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHSBS, storing to %ptr if not all elements are higher. +define <4 x i32> @test_vfchsbs_notall_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchsbs_notall_store: +; CHECK-NOT: %r +; CHECK: vfchsbs %v24, %v24, %v26 +; CHECK-NEXT: {{bher|ber}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFCHESBS with no processing of the result. +define i32 @test_vfchesbs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchesbs: +; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFCHESBS, returning 1 if neither element is higher or equal. 
+define i32 @test_vfchesbs_none_bool(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfchesbs_none_bool: +; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26 +; CHECK: ipm [[REG:%r[0-5]]] +; CHECK: risblg %r2, [[REG]], 31, 159, 35 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 3 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFCHESBS, storing to %ptr if neither element is higher or equal. +define <4 x i32> @test_vfchesbs_none_store(<4 x float> %a, <4 x float> %b, + i32 *%ptr) { +; CHECK-LABEL: test_vfchesbs_none_store: +; CHECK-NOT: %r +; CHECK: vfchesbs %v24, %v24, %v26 +; CHECK-NEXT: {{bnor|bler}} %r14 +; CHECK: mvhi 0(%r2), 0 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a, + <4 x float> %b) + %res = extractvalue {<4 x i32>, i32} %call, 0 + %cc = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp uge i32 %cc, 3 + br i1 %cmp, label %store, label %exit + +store: + store i32 0, i32 *%ptr + br label %exit + +exit: + ret <4 x i32> %res +} + +; VFTCISB with the lowest useful class selector and no processing of the result. +define i32 @test_vftcisb(<4 x float> %a) { +; CHECK-LABEL: test_vftcisb: +; CHECK: vftcisb {{%v[0-9]+}}, %v24, 1 +; CHECK: ipm %r2 +; CHECK: srl %r2, 28 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 1) + %res = extractvalue {<4 x i32>, i32} %call, 1 + ret i32 %res +} + +; VFTCISB with the highest useful class selector, returning 1 if all elements +; have the right class (CC == 0). 
+define i32 @test_vftcisb_all_bool(<4 x float> %a) { +; CHECK-LABEL: test_vftcisb_all_bool: +; CHECK: vftcisb {{%v[0-9]+}}, %v24, 4094 +; CHECK: afi %r2, -268435456 +; CHECK: srl %r2, 31 +; CHECK: br %r14 + %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 4094) + %res = extractvalue {<4 x i32>, i32} %call, 1 + %cmp = icmp eq i32 %res, 0 + %ext = zext i1 %cmp to i32 + ret i32 %ext +} + +; VFISB with a rounding mode not usable via standard intrinsics. +define <4 x float> @test_vfisb_0_4(<4 x float> %a) { +; CHECK-LABEL: test_vfisb_0_4: +; CHECK: vfisb %v24, %v24, 0, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 0, i32 4) + ret <4 x float> %res +} + +; VFISB with IEEE-inexact exception suppressed. +define <4 x float> @test_vfisb_4_0(<4 x float> %a) { +; CHECK-LABEL: test_vfisb_4_0: +; CHECK: vfisb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 4, i32 0) + ret <4 x float> %res +} + +; VFMAXDB. +define <2 x double> @test_vfmaxdb(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfmaxdb: +; CHECK: vfmaxdb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %a, <2 x double> %b, i32 4) + ret <2 x double> %res +} + +; VFMINDB. +define <2 x double> @test_vfmindb(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_vfmindb: +; CHECK: vfmindb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <2 x double> @llvm.s390.vfmindb(<2 x double> %a, <2 x double> %b, i32 4) + ret <2 x double> %res +} + +; VFMAXSB. +define <4 x float> @test_vfmaxsb(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfmaxsb: +; CHECK: vfmaxsb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %a, <4 x float> %b, i32 4) + ret <4 x float> %res +} + +; VFMINSB. 
+define <4 x float> @test_vfminsb(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vfminsb: +; CHECK: vfminsb %v24, %v24, %v26, 4 +; CHECK: br %r14 + %res = call <4 x float> @llvm.s390.vfminsb(<4 x float> %a, <4 x float> %b, i32 4) + ret <4 x float> %res +} + diff --git a/test/CodeGen/SystemZ/vec-intrinsics.ll b/test/CodeGen/SystemZ/vec-intrinsics.ll deleted file mode 100644 index 6f5eb0691aa8..000000000000 --- a/test/CodeGen/SystemZ/vec-intrinsics.ll +++ /dev/null @@ -1,3335 +0,0 @@ -; Test vector intrinsics. -; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s - -declare i32 @llvm.s390.lcbb(i8 *, i32) -declare <16 x i8> @llvm.s390.vlbb(i8 *, i32) -declare <16 x i8> @llvm.s390.vll(i32, i8 *) -declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32) -declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>) -declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>) -declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>) -declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>) -declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>) -declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *) -declare <8 x i16> @llvm.s390.vuphb(<16 x i8>) -declare <4 x i32> @llvm.s390.vuphh(<8 x i16>) -declare <2 x i64> @llvm.s390.vuphf(<4 x i32>) -declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>) -declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>) -declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>) -declare <8 x i16> @llvm.s390.vuplb(<16 x i8>) -declare 
<4 x i32> @llvm.s390.vuplhw(<8 x i16>) -declare <2 x i64> @llvm.s390.vuplf(<4 x i32>) -declare <8 x i16> @llvm.s390.vupllb(<16 x i8>) -declare <4 x i32> @llvm.s390.vupllh(<8 x i16>) -declare <2 x i64> @llvm.s390.vupllf(<4 x i32>) -declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vavgf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vavgg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vavglb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vavglh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vavglf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vavglg(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) -declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x i64>) -declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) -declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vmalhb(<16 
x i8>, <16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmalhh(<8 x i16>, <8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmalhf(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmaeb(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaeh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmaef(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmaleb(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaleh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmalef(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmaob(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaoh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmaof(<4 x i32>, <4 x i32>, <2 x i64>) -declare <8 x i16> @llvm.s390.vmalob(<16 x i8>, <16 x i8>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmaloh(<8 x i16>, <8 x i16>, <4 x i32>) -declare <2 x i64> @llvm.s390.vmalof(<4 x i32>, <4 x i32>, <2 x i64>) -declare <16 x i8> @llvm.s390.vmhb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmhh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmhf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vmlhb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vmlhh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vmlhf(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmeb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmeh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmef(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmleb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmleh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmlef(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmob(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmoh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) -declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) -declare <2 x i64> 
@llvm.s390.vmlof(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) -declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32) -declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) -declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) -declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare <2 x i64> @llvm.s390.verimg(<2 x i64>, <2 x i64>, <2 x i64>, i32) -declare <16 x i8> @llvm.s390.vsl(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vslb(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsra(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrab(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrl(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsrlb(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32) -declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) -declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) -declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) -declare <16 x i8> 
@llvm.s390.vsumqg(<2 x i64>, <2 x i64>) -declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) -declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64>, <2 x i64>) -declare {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32>, <4 x i32>) -declare {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vfaeb(<16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vfaeh(<8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vfaef(<4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8>, <16 x i8>, i32) -declare {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16>, <8 x i16>, i32) -declare {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32>, <4 x i32>, i32) -declare <16 x i8> @llvm.s390.vfaezb(<16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vfaezh(<8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vfaezf(<4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8>, <16 x i8>, i32) -declare {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16>, <8 x i16>, i32) -declare {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32>, <4 x i32>, i32) -declare <16 x i8> @llvm.s390.vfeeb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeeh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfeef(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16>, <8 x i16>) 
-declare {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfeezb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeezh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfeezf(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfeneb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfeneh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfenef(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vfenezb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.vfenezh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.vfenezf(<4 x i32>, <4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8>, <16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16>, <8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vistrb(<16 x i8>) -declare <8 x i16> @llvm.s390.vistrh(<8 x i16>) -declare <4 x i32> @llvm.s390.vistrf(<4 x i32>) -declare {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8>) -declare {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16>) -declare {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32>) -declare <16 x i8> @llvm.s390.vstrcb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vstrch(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vstrcf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8>, <16 x i8>, <16 x i8>, - i32) -declare {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16>, <8 x i16>, <8 x i16>, - i32) -declare {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32>, <4 x i32>, <4 x i32>, - i32) 
-declare <16 x i8> @llvm.s390.vstrczb(<16 x i8>, <16 x i8>, <16 x i8>, i32) -declare <8 x i16> @llvm.s390.vstrczh(<8 x i16>, <8 x i16>, <8 x i16>, i32) -declare <4 x i32> @llvm.s390.vstrczf(<4 x i32>, <4 x i32>, <4 x i32>, i32) -declare {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8>, <16 x i8>, <16 x i8>, - i32) -declare {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16>, <8 x i16>, <8 x i16>, - i32) -declare {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32>, <4 x i32>, <4 x i32>, - i32) -declare {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double>, <2 x double>) -declare {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double>, i32) -declare <2 x double> @llvm.s390.vfidb(<2 x double>, i32, i32) - -; LCBB with the lowest M3 operand. -define i32 @test_lcbb1(i8 *%ptr) { -; CHECK-LABEL: test_lcbb1: -; CHECK: lcbb %r2, 0(%r2), 0 -; CHECK: br %r14 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 0) - ret i32 %res -} - -; LCBB with the highest M3 operand. -define i32 @test_lcbb2(i8 *%ptr) { -; CHECK-LABEL: test_lcbb2: -; CHECK: lcbb %r2, 0(%r2), 15 -; CHECK: br %r14 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 15) - ret i32 %res -} - -; LCBB with a displacement and index. -define i32 @test_lcbb3(i8 *%base, i64 %index) { -; CHECK-LABEL: test_lcbb3: -; CHECK: lcbb %r2, 4095({{%r2,%r3|%r3,%r2}}), 4 -; CHECK: br %r14 - %add = add i64 %index, 4095 - %ptr = getelementptr i8, i8 *%base, i64 %add - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 4) - ret i32 %res -} - -; LCBB with an out-of-range displacement. -define i32 @test_lcbb4(i8 *%base) { -; CHECK-LABEL: test_lcbb4: -; CHECK: lcbb %r2, 0({{%r[1-5]}}), 5 -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 5) - ret i32 %res -} - -; VLBB with the lowest M3 operand. 
-define <16 x i8> @test_vlbb1(i8 *%ptr) { -; CHECK-LABEL: test_vlbb1: -; CHECK: vlbb %v24, 0(%r2), 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 0) - ret <16 x i8> %res -} - -; VLBB with the highest M3 operand. -define <16 x i8> @test_vlbb2(i8 *%ptr) { -; CHECK-LABEL: test_vlbb2: -; CHECK: vlbb %v24, 0(%r2), 15 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 15) - ret <16 x i8> %res -} - -; VLBB with a displacement and index. -define <16 x i8> @test_vlbb3(i8 *%base, i64 %index) { -; CHECK-LABEL: test_vlbb3: -; CHECK: vlbb %v24, 4095({{%r2,%r3|%r3,%r2}}), 4 -; CHECK: br %r14 - %add = add i64 %index, 4095 - %ptr = getelementptr i8, i8 *%base, i64 %add - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 4) - ret <16 x i8> %res -} - -; VLBB with an out-of-range displacement. -define <16 x i8> @test_vlbb4(i8 *%base) { -; CHECK-LABEL: test_vlbb4: -; CHECK: vlbb %v24, 0({{%r[1-5]}}), 5 -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 5) - ret <16 x i8> %res -} - -; VLL with the lowest in-range displacement. -define <16 x i8> @test_vll1(i8 *%ptr, i32 %length) { -; CHECK-LABEL: test_vll1: -; CHECK: vll %v24, %r3, 0(%r2) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VLL with the highest in-range displacement. -define <16 x i8> @test_vll2(i8 *%base, i32 %length) { -; CHECK-LABEL: test_vll2: -; CHECK: vll %v24, %r3, 4095(%r2) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4095 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VLL with an out-of-range displacementa. 
-define <16 x i8> @test_vll3(i8 *%base, i32 %length) { -; CHECK-LABEL: test_vll3: -; CHECK: vll %v24, %r3, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; Check that VLL doesn't allow an index. -define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) { -; CHECK-LABEL: test_vll4: -; CHECK: vll %v24, %r4, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 %index - %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr) - ret <16 x i8> %res -} - -; VPDI taking element 0 from each half. -define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpdi1: -; CHECK: vpdi %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 0) - ret <2 x i64> %res -} - -; VPDI taking element 1 from each half. -define <2 x i64> @test_vpdi2(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpdi2: -; CHECK: vpdi %v24, %v24, %v26, 10 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 10) - ret <2 x i64> %res -} - -; VPERM. -define <16 x i8> @test_vperm(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vperm: -; CHECK: vperm %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vperm(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VPKSH. -define <16 x i8> @test_vpksh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vpksh: -; CHECK: vpksh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vpksh(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %res -} - -; VPKSF. -define <8 x i16> @test_vpksf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vpksf: -; CHECK: vpksf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vpksf(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %res -} - -; VPKSG. 
-define <4 x i32> @test_vpksg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpksg: -; CHECK: vpksg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vpksg(<2 x i64> %a, <2 x i64> %b) - ret <4 x i32> %res -} - -; VPKSHS with no processing of the result. -define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpkshs: -; CHECK: vpkshs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VPKSHS, storing to %ptr if all values were saturated. -define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) { -; CHECK-LABEL: test_vpkshs_all_store: -; CHECK: vpkshs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VPKSFS with no processing of the result. -define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpksfs: -; CHECK: vpksfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VPKSFS, storing to %ptr if any values were saturated. 
-define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { -; CHECK-LABEL: test_vpksfs_any_store: -; CHECK: vpksfs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VPKSGS with no processing of the result. -define <4 x i32> @test_vpksgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpksgs: -; CHECK: vpksgs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VPKSGS, storing to %ptr if no elements were saturated -define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpksgs_none_store: -; CHECK: vpksgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VPKLSH. -define <16 x i8> @test_vpklsh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vpklsh: -; CHECK: vpklsh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vpklsh(<8 x i16> %a, <8 x i16> %b) - ret <16 x i8> %res -} - -; VPKLSF. 
-define <8 x i16> @test_vpklsf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vpklsf: -; CHECK: vpklsf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> %a, <4 x i32> %b) - ret <8 x i16> %res -} - -; VPKLSG. -define <4 x i32> @test_vpklsg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpklsg: -; CHECK: vpklsg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> %a, <2 x i64> %b) - ret <4 x i32> %res -} - -; VPKLSHS with no processing of the result. -define <16 x i8> @test_vpklshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklshs: -; CHECK: vpklshs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VPKLSHS, storing to %ptr if all values were saturated. -define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklshs_all_store: -; CHECK: vpklshs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp eq i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VPKLSFS with no processing of the result. 
-define <8 x i16> @test_vpklsfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklsfs: -; CHECK: vpklsfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VPKLSFS, storing to %ptr if any values were saturated. -define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklsfs_any_store: -; CHECK: vpklsfs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ne i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VPKLSGS with no processing of the result. 
-define <4 x i32> @test_vpklsgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vpklsgs: -; CHECK: vpklsgs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VPKLSGS, storing to %ptr if no elements were saturated -define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vpklsgs_none_store: -; CHECK: vpklsgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VSTL with the lowest in-range displacement. -define void @test_vstl1(<16 x i8> %vec, i8 *%ptr, i32 %length) { -; CHECK-LABEL: test_vstl1: -; CHECK: vstl %v24, %r3, 0(%r2) -; CHECK: br %r14 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VSTL with the highest in-range displacement. -define void @test_vstl2(<16 x i8> %vec, i8 *%base, i32 %length) { -; CHECK-LABEL: test_vstl2: -; CHECK: vstl %v24, %r3, 4095(%r2) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4095 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VSTL with an out-of-range displacement. 
-define void @test_vstl3(<16 x i8> %vec, i8 *%base, i32 %length) { -; CHECK-LABEL: test_vstl3: -; CHECK: vstl %v24, %r3, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 4096 - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; Check that VSTL doesn't allow an index. -define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { -; CHECK-LABEL: test_vstl4: -; CHECK: vstl %v24, %r4, 0({{%r[1-5]}}) -; CHECK: br %r14 - %ptr = getelementptr i8, i8 *%base, i64 %index - call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) - ret void -} - -; VUPHB. -define <8 x i16> @test_vuphb(<16 x i8> %a) { -; CHECK-LABEL: test_vuphb: -; CHECK: vuphb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuphb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPHH. -define <4 x i32> @test_vuphh(<8 x i16> %a) { -; CHECK-LABEL: test_vuphh: -; CHECK: vuphh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuphh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPHF. -define <2 x i64> @test_vuphf(<4 x i32> %a) { -; CHECK-LABEL: test_vuphf: -; CHECK: vuphf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuphf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLHB. -define <8 x i16> @test_vuplhb(<16 x i8> %a) { -; CHECK-LABEL: test_vuplhb: -; CHECK: vuplhb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuplhb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLHH. -define <4 x i32> @test_vuplhh(<8 x i16> %a) { -; CHECK-LABEL: test_vuplhh: -; CHECK: vuplhh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuplhh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLHF. -define <2 x i64> @test_vuplhf(<4 x i32> %a) { -; CHECK-LABEL: test_vuplhf: -; CHECK: vuplhf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLB. 
-define <8 x i16> @test_vuplb(<16 x i8> %a) { -; CHECK-LABEL: test_vuplb: -; CHECK: vuplb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vuplb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLHW. -define <4 x i32> @test_vuplhw(<8 x i16> %a) { -; CHECK-LABEL: test_vuplhw: -; CHECK: vuplhw %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vuplhw(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLF. -define <2 x i64> @test_vuplf(<4 x i32> %a) { -; CHECK-LABEL: test_vuplf: -; CHECK: vuplf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vuplf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VUPLLB. -define <8 x i16> @test_vupllb(<16 x i8> %a) { -; CHECK-LABEL: test_vupllb: -; CHECK: vupllb %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vupllb(<16 x i8> %a) - ret <8 x i16> %res -} - -; VUPLLH. -define <4 x i32> @test_vupllh(<8 x i16> %a) { -; CHECK-LABEL: test_vupllh: -; CHECK: vupllh %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vupllh(<8 x i16> %a) - ret <4 x i32> %res -} - -; VUPLLF. -define <2 x i64> @test_vupllf(<4 x i32> %a) { -; CHECK-LABEL: test_vupllf: -; CHECK: vupllf %v24, %v24 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vupllf(<4 x i32> %a) - ret <2 x i64> %res -} - -; VACCB. -define <16 x i8> @test_vaccb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaccb: -; CHECK: vaccb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaccb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACCH. -define <8 x i16> @test_vacch(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vacch: -; CHECK: vacch %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vacch(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VACCF. 
-define <4 x i32> @test_vaccf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaccf: -; CHECK: vaccf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vaccf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VACCG. -define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaccg: -; CHECK: vaccg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vaccg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VAQ. -define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaq: -; CHECK: vaq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACQ. -define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vacq: -; CHECK: vacq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VACCQ. -define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaccq: -; CHECK: vaccq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VACCCQ. -define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vacccq: -; CHECK: vacccq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VAVGB. -define <16 x i8> @test_vavgb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vavgb: -; CHECK: vavgb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vavgb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VAVGH. 
-define <8 x i16> @test_vavgh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vavgh: -; CHECK: vavgh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vavgh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VAVGF. -define <4 x i32> @test_vavgf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vavgf: -; CHECK: vavgf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vavgf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VAVGG. -define <2 x i64> @test_vavgg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vavgg: -; CHECK: vavgg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vavgg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VAVGLB. -define <16 x i8> @test_vavglb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vavglb: -; CHECK: vavglb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vavglb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VAVGLH. -define <8 x i16> @test_vavglh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vavglh: -; CHECK: vavglh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vavglh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VAVGLF. -define <4 x i32> @test_vavglf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vavglf: -; CHECK: vavglf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vavglf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VAVGLG. -define <2 x i64> @test_vavglg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vavglg: -; CHECK: vavglg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vavglg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VCKSM. -define <4 x i32> @test_vcksm(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vcksm: -; CHECK: vcksm %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vcksm(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VGFMB. 
-define <8 x i16> @test_vgfmb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vgfmb: -; CHECK: vgfmb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VGFMH. -define <4 x i32> @test_vgfmh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vgfmh: -; CHECK: vgfmh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VGFMF. -define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vgfmf: -; CHECK: vgfmf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VGFMG. -define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vgfmg: -; CHECK: vgfmg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res -} - -; VGFMAB. -define <8 x i16> @test_vgfmab(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vgfmab: -; CHECK: vgfmab %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VGFMAH. -define <4 x i32> @test_vgfmah(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vgfmah: -; CHECK: vgfmah %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VGFMAF. -define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vgfmaf: -; CHECK: vgfmaf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VGFMAG. 
-define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vgfmag: -; CHECK: vgfmag %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMAHB. -define <16 x i8> @test_vmahb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vmahb: -; CHECK: vmahb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmahb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMAHH. -define <8 x i16> @test_vmahh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmahh: -; CHECK: vmahh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmahh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAHF. -define <4 x i32> @test_vmahf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmahf: -; CHECK: vmahf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmahf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALHB. -define <16 x i8> @test_vmalhb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vmalhb: -; CHECK: vmalhb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmalhb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VMALHH. -define <8 x i16> @test_vmalhh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmalhh: -; CHECK: vmalhh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmalhh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALHF. -define <4 x i32> @test_vmalhf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmalhf: -; CHECK: vmalhf %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmalhf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAEB. 
-define <8 x i16> @test_vmaeb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaeb: -; CHECK: vmaeb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaeb(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAEH. -define <4 x i32> @test_vmaeh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaeh: -; CHECK: vmaeh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaeh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAEF. -define <2 x i64> @test_vmaef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmaef: -; CHECK: vmaef %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmaef(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMALEB. -define <8 x i16> @test_vmaleb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaleb: -; CHECK: vmaleb %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaleb(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALEH. -define <4 x i32> @test_vmaleh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaleh: -; CHECK: vmaleh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaleh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALEF. -define <2 x i64> @test_vmalef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmalef: -; CHECK: vmalef %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmalef(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMAOB. -define <8 x i16> @test_vmaob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmaob: -; CHECK: vmaob %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmaob(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMAOH. 
-define <4 x i32> @test_vmaoh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaoh: -; CHECK: vmaoh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaoh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMAOF. -define <2 x i64> @test_vmaof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmaof: -; CHECK: vmaof %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmaof(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMALOB. -define <8 x i16> @test_vmalob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vmalob: -; CHECK: vmalob %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmalob(<16 x i8> %a, <16 x i8> %b, - <8 x i16> %c) - ret <8 x i16> %res -} - -; VMALOH. -define <4 x i32> @test_vmaloh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vmaloh: -; CHECK: vmaloh %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmaloh(<8 x i16> %a, <8 x i16> %b, - <4 x i32> %c) - ret <4 x i32> %res -} - -; VMALOF. -define <2 x i64> @test_vmalof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { -; CHECK-LABEL: test_vmalof: -; CHECK: vmalof %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmalof(<4 x i32> %a, <4 x i32> %b, - <2 x i64> %c) - ret <2 x i64> %res -} - -; VMHB. -define <16 x i8> @test_vmhb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmhb: -; CHECK: vmhb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmhb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VMHH. -define <8 x i16> @test_vmhh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmhh: -; CHECK: vmhh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmhh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VMHF. 
-define <4 x i32> @test_vmhf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmhf: -; CHECK: vmhf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmhf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VMLHB. -define <16 x i8> @test_vmlhb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmlhb: -; CHECK: vmlhb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vmlhb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VMLHH. -define <8 x i16> @test_vmlhh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmlhh: -; CHECK: vmlhh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmlhh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VMLHF. -define <4 x i32> @test_vmlhf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlhf: -; CHECK: vmlhf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmlhf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VMEB. -define <8 x i16> @test_vmeb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmeb: -; CHECK: vmeb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmeb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMEH. -define <4 x i32> @test_vmeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmeh: -; CHECK: vmeh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmeh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMEF. -define <2 x i64> @test_vmef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmef: -; CHECK: vmef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmef(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMLEB. -define <8 x i16> @test_vmleb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmleb: -; CHECK: vmleb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmleb(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMLEH. 
-define <4 x i32> @test_vmleh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmleh: -; CHECK: vmleh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmleh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMLEF. -define <2 x i64> @test_vmlef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlef: -; CHECK: vmlef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmlef(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMOB. -define <8 x i16> @test_vmob(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmob: -; CHECK: vmob %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmob(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMOH. -define <4 x i32> @test_vmoh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmoh: -; CHECK: vmoh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmoh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMOF. -define <2 x i64> @test_vmof(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmof: -; CHECK: vmof %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmof(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VMLOB. -define <8 x i16> @test_vmlob(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vmlob: -; CHECK: vmlob %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vmlob(<16 x i8> %a, <16 x i8> %b) - ret <8 x i16> %res -} - -; VMLOH. -define <4 x i32> @test_vmloh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vmloh: -; CHECK: vmloh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vmloh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VMLOF. -define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vmlof: -; CHECK: vmlof %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vmlof(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VERLLVB. 
-define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_verllvb: -; CHECK: verllvb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VERLLVH. -define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_verllvh: -; CHECK: verllvh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VERLLVF. -define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_verllvf: -; CHECK: verllvf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VERLLVG. -define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_verllvg: -; CHECK: verllvg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VERLLB. -define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { -; CHECK-LABEL: test_verllb: -; CHECK: verllb %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) - ret <16 x i8> %res -} - -; VERLLH. -define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { -; CHECK-LABEL: test_verllh: -; CHECK: verllh %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) - ret <8 x i16> %res -} - -; VERLLF. -define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { -; CHECK-LABEL: test_verllf: -; CHECK: verllf %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) - ret <4 x i32> %res -} - -; VERLLG. -define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { -; CHECK-LABEL: test_verllg: -; CHECK: verllg %v24, %v24, 0(%r2) -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) - ret <2 x i64> %res -} - -; VERLLB with the smallest count. 
-define <16 x i8> @test_verllb_1(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_1: -; CHECK: verllb %v24, %v24, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) - ret <16 x i8> %res -} - -; VERLLB with the largest count. -define <16 x i8> @test_verllb_4095(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4095: -; CHECK: verllb %v24, %v24, 4095 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) - ret <16 x i8> %res -} - -; VERLLB with the largest count + 1. -define <16 x i8> @test_verllb_4096(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4096: -; CHECK: lhi [[REG:%r[1-5]]], 4096 -; CHECK: verllb %v24, %v24, 0([[REG]]) -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) - ret <16 x i8> %res -} - -; VERIMB. -define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_verimb: -; CHECK: verimb %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VERIMH. -define <8 x i16> @test_verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_verimh: -; CHECK: verimh %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, i32 1) - ret <8 x i16> %res -} - -; VERIMF. -define <4 x i32> @test_verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_verimf: -; CHECK: verimf %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1) - ret <4 x i32> %res -} - -; VERIMG. -define <2 x i64> @test_verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { -; CHECK-LABEL: test_verimg: -; CHECK: verimg %v24, %v26, %v28, 1 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, i32 1) - ret <2 x i64> %res -} - -; VERIMB with a different mask. 
-define <16 x i8> @test_verimb_254(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_verimb_254: -; CHECK: verimb %v24, %v26, %v28, 254 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 254) - ret <16 x i8> %res -} - -; VSL. -define <16 x i8> @test_vsl(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsl: -; CHECK: vsl %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsl(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSLB. -define <16 x i8> @test_vslb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vslb: -; CHECK: vslb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vslb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRA. -define <16 x i8> @test_vsra(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsra: -; CHECK: vsra %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsra(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRAB. -define <16 x i8> @test_vsrab(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrab: -; CHECK: vsrab %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrab(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRL. -define <16 x i8> @test_vsrl(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrl: -; CHECK: vsrl %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrl(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSRLB. -define <16 x i8> @test_vsrlb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsrlb: -; CHECK: vsrlb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSLDB with the minimum useful value. 
-define <16 x i8> @test_vsldb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsldb_1: -; CHECK: vsldb %v24, %v24, %v26, 1 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VSLDB with the maximum value. -define <16 x i8> @test_vsldb_15(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsldb_15: -; CHECK: vsldb %v24, %v24, %v26, 15 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 15) - ret <16 x i8> %res -} - -; VSCBIB. -define <16 x i8> @test_vscbib(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vscbib: -; CHECK: vscbib %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vscbib(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSCBIH. -define <8 x i16> @test_vscbih(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vscbih: -; CHECK: vscbih %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vscbih(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VSCBIF. -define <4 x i32> @test_vscbif(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vscbif: -; CHECK: vscbif %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vscbif(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VSCBIG. -define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vscbig: -; CHECK: vscbig %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vscbig(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VSQ. -define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsq: -; CHECK: vsq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSBIQ. 
-define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vsbiq: -; CHECK: vsbiq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VSCBIQ. -define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vscbiq: -; CHECK: vscbiq %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VSBCBIQ. -define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vsbcbiq: -; CHECK: vsbcbiq %v24, %v24, %v26, %v28 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res -} - -; VSUMB. -define <4 x i32> @test_vsumb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsumb: -; CHECK: vsumb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vsumb(<16 x i8> %a, <16 x i8> %b) - ret <4 x i32> %res -} - -; VSUMH. -define <4 x i32> @test_vsumh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsumh: -; CHECK: vsumh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vsumh(<8 x i16> %a, <8 x i16> %b) - ret <4 x i32> %res -} - -; VSUMGH. -define <2 x i64> @test_vsumgh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsumgh: -; CHECK: vsumgh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %a, <8 x i16> %b) - ret <2 x i64> %res -} - -; VSUMGF. -define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsumgf: -; CHECK: vsumgf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %a, <4 x i32> %b) - ret <2 x i64> %res -} - -; VSUMQF. 
-define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsumqf: -; CHECK: vsumqf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) - ret <16 x i8> %res -} - -; VSUMQG. -define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsumqg: -; CHECK: vsumqg %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res -} - -; VTM with no processing of the result. -define i32 @test_vtm(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vtm: -; CHECK: vtm %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) - ret i32 %res -} - -; VTM, storing to %ptr if all bits are set. -define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vtm_all_store: -; CHECK-NOT: %r -; CHECK: vtm %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) - %cmp = icmp sge i32 %res, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret void -} - -; VCEQBS with no processing of the result. -define i32 @test_vceqbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vceqbs: -; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCEQBS, returning 1 if any elements are equal (CC != 3). 
-define i32 @test_vceqbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vceqbs_any_bool: -; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQBS, storing to %ptr if any elements are equal. -define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vceqbs_any_store: -; CHECK-NOT: %r -; CHECK: vceqbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCEQHS with no processing of the result. -define i32 @test_vceqhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vceqhs: -; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCEQHS, returning 1 if not all elements are equal. -define i32 @test_vceqhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vceqhs_notall_bool: -; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQHS, storing to %ptr if not all elements are equal. 
-define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vceqhs_notall_store: -; CHECK-NOT: %r -; CHECK: vceqhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCEQFS with no processing of the result. -define i32 @test_vceqfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vceqfs: -; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCEQFS, returning 1 if no elements are equal. -define i32 @test_vceqfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vceqfs_none_bool: -; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQFS, storing to %ptr if no elements are equal. 
-define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vceqfs_none_store: -; CHECK-NOT: %r -; CHECK: vceqfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCEQGS with no processing of the result. -define i32 @test_vceqgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vceqgs: -; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCEQGS returning 1 if all elements are equal (CC == 0). -define i32 @test_vceqgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vceqgs_all_bool: -; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ult i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCEQGS, storing to %ptr if all elements are equal. 
-define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vceqgs_all_store: -; CHECK-NOT: %r -; CHECK: vceqgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VCHBS with no processing of the result. -define i32 @test_vchbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchbs: -; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCHBS, returning 1 if any elements are higher (CC != 3). -define i32 @test_vchbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchbs_any_bool: -; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHBS, storing to %ptr if any elements are higher. 
-define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchbs_any_store: -; CHECK-NOT: %r -; CHECK: vchbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCHHS with no processing of the result. -define i32 @test_vchhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchhs: -; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCHHS, returning 1 if not all elements are higher. -define i32 @test_vchhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchhs_notall_bool: -; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHHS, storing to %ptr if not all elements are higher. 
-define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchhs_notall_store: -; CHECK-NOT: %r -; CHECK: vchhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCHFS with no processing of the result. -define i32 @test_vchfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchfs: -; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCHFS, returning 1 if no elements are higher. -define i32 @test_vchfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchfs_none_bool: -; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHFS, storing to %ptr if no elements are higher. 
-define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchfs_none_store: -; CHECK-NOT: %r -; CHECK: vchfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCHGS with no processing of the result. -define i32 @test_vchgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchgs: -; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCHGS returning 1 if all elements are higher (CC == 0). -define i32 @test_vchgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchgs_all_bool: -; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ult i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHGS, storing to %ptr if all elements are higher. 
-define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchgs_all_store: -; CHECK-NOT: %r -; CHECK: vchgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VCHLBS with no processing of the result. -define i32 @test_vchlbs(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchlbs: -; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - ret i32 %res -} - -; VCHLBS, returning 1 if any elements are higher (CC != 3). -define i32 @test_vchlbs_any_bool(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vchlbs_any_bool: -; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLBS, storing to %ptr if any elements are higher. 
-define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchlbs_any_store: -; CHECK-NOT: %r -; CHECK: vchlbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - %cmp = icmp sle i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <16 x i8> %res -} - -; VCHLHS with no processing of the result. -define i32 @test_vchlhs(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchlhs: -; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - ret i32 %res -} - -; VCHLHS, returning 1 if not all elements are higher. -define i32 @test_vchlhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vchlhs_notall_bool: -; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp uge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLHS, storing to %ptr if not all elements are higher. 
-define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchlhs_notall_store: -; CHECK-NOT: %r -; CHECK: vchlhs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - %cmp = icmp sgt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <8 x i16> %res -} - -; VCHLFS with no processing of the result. -define i32 @test_vchlfs(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchlfs: -; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - ret i32 %res -} - -; VCHLFS, returning 1 if no elements are higher. -define i32 @test_vchlfs_none_bool(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vchlfs_none_bool: -; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLFS, storing to %ptr if no elements are higher. 
-define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vchlfs_none_store: -; CHECK-NOT: %r -; CHECK: vchlfs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - %cmp = icmp sge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <4 x i32> %res -} - -; VCHLGS with no processing of the result. -define i32 @test_vchlgs(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchlgs: -; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VCHLGS returning 1 if all elements are higher (CC == 0). -define i32 @test_vchlgs_all_bool(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vchlgs_all_bool: -; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp slt i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VCHLGS, storing to %ptr if all elements are higher. 
-define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { -; CHECK-LABEL: test_vchlgs_all_store: -; CHECK-NOT: %r -; CHECK: vchlgs %v24, %v24, %v26 -; CHECK-NEXT: {{bnher|bner}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFAEB with !IN !RT. -define <16 x i8> @test_vfaeb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_0: -; CHECK: vfaeb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 0) - ret <16 x i8> %res -} - -; VFAEB with !IN RT. -define <16 x i8> @test_vfaeb_4(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_4: -; CHECK: vfaeb %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 4) - ret <16 x i8> %res -} - -; VFAEB with IN !RT. -define <16 x i8> @test_vfaeb_8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_8: -; CHECK: vfaeb %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 8) - ret <16 x i8> %res -} - -; VFAEB with IN RT. -define <16 x i8> @test_vfaeb_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_12: -; CHECK: vfaeb %v24, %v24, %v26, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 12) - ret <16 x i8> %res -} - -; VFAEB with CS -- should be ignored. -define <16 x i8> @test_vfaeb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaeb_1: -; CHECK: vfaeb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VFAEH. 
-define <8 x i16> @test_vfaeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfaeh: -; CHECK: vfaeh %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfaeh(<8 x i16> %a, <8 x i16> %b, i32 4) - ret <8 x i16> %res -} - -; VFAEF. -define <4 x i32> @test_vfaef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfaef: -; CHECK: vfaef %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfaef(<4 x i32> %a, <4 x i32> %b, i32 8) - ret <4 x i32> %res -} - -; VFAEBS. -define <16 x i8> @test_vfaebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaebs: -; CHECK: vfaebs %v24, %v24, %v26, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8> %a, <16 x i8> %b, - i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFAEHS. -define <8 x i16> @test_vfaehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaehs: -; CHECK: vfaehs %v24, %v24, %v26, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16> %a, <8 x i16> %b, - i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFAEFS. -define <4 x i32> @test_vfaefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaefs: -; CHECK: vfaefs %v24, %v24, %v26, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32> %a, <4 x i32> %b, - i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFAEZB with !IN !RT. 
-define <16 x i8> @test_vfaezb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_0: -; CHECK: vfaezb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 0) - ret <16 x i8> %res -} - -; VFAEZB with !IN RT. -define <16 x i8> @test_vfaezb_4(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_4: -; CHECK: vfaezb %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 4) - ret <16 x i8> %res -} - -; VFAEZB with IN !RT. -define <16 x i8> @test_vfaezb_8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_8: -; CHECK: vfaezb %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 8) - ret <16 x i8> %res -} - -; VFAEZB with IN RT. -define <16 x i8> @test_vfaezb_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_12: -; CHECK: vfaezb %v24, %v24, %v26, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 12) - ret <16 x i8> %res -} - -; VFAEZB with CS -- should be ignored. -define <16 x i8> @test_vfaezb_1(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfaezb_1: -; CHECK: vfaezb %v24, %v24, %v26, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 1) - ret <16 x i8> %res -} - -; VFAEZH. -define <8 x i16> @test_vfaezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfaezh: -; CHECK: vfaezh %v24, %v24, %v26, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfaezh(<8 x i16> %a, <8 x i16> %b, i32 4) - ret <8 x i16> %res -} - -; VFAEZF. -define <4 x i32> @test_vfaezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfaezf: -; CHECK: vfaezf %v24, %v24, %v26, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfaezf(<4 x i32> %a, <4 x i32> %b, i32 8) - ret <4 x i32> %res -} - -; VFAEZBS. 
-define <16 x i8> @test_vfaezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezbs: -; CHECK: vfaezbs %v24, %v24, %v26, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8> %a, <16 x i8> %b, - i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFAEZHS. -define <8 x i16> @test_vfaezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezhs: -; CHECK: vfaezhs %v24, %v24, %v26, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16> %a, <8 x i16> %b, - i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFAEZFS. -define <4 x i32> @test_vfaezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfaezfs: -; CHECK: vfaezfs %v24, %v24, %v26, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32> %a, <4 x i32> %b, - i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFEEB. -define <16 x i8> @test_vfeeb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeeb_0: -; CHECK: vfeeb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeeb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFEEH. -define <8 x i16> @test_vfeeh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeeh: -; CHECK: vfeeh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeeh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFEEF. 
-define <4 x i32> @test_vfeef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfeef: -; CHECK: vfeef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfeef(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFEEBS. -define <16 x i8> @test_vfeebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeebs: -; CHECK: vfeebs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFEEHS. -define <8 x i16> @test_vfeehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeehs: -; CHECK: vfeehs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFEEFS. -define <4 x i32> @test_vfeefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeefs: -; CHECK: vfeefs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFEEZB. -define <16 x i8> @test_vfeezb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeezb: -; CHECK: vfeezb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeezb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFEEZH. 
-define <8 x i16> @test_vfeezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeezh: -; CHECK: vfeezh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeezh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFEEZF. -define <4 x i32> @test_vfeezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfeezf: -; CHECK: vfeezf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfeezf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFEEZBS. -define <16 x i8> @test_vfeezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezbs: -; CHECK: vfeezbs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFEEZHS. -define <8 x i16> @test_vfeezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezhs: -; CHECK: vfeezhs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFEEZFS. -define <4 x i32> @test_vfeezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfeezfs: -; CHECK: vfeezfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFENEB. 
-define <16 x i8> @test_vfeneb_0(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfeneb_0: -; CHECK: vfeneb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfeneb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFENEH. -define <8 x i16> @test_vfeneh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfeneh: -; CHECK: vfeneh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfeneh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFENEF. -define <4 x i32> @test_vfenef(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfenef: -; CHECK: vfenef %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfenef(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFENEBS. -define <16 x i8> @test_vfenebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenebs: -; CHECK: vfenebs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFENEHS. -define <8 x i16> @test_vfenehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenehs: -; CHECK: vfenehs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFENEFS. 
-define <4 x i32> @test_vfenefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenefs: -; CHECK: vfenefs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFENEZB. -define <16 x i8> @test_vfenezb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vfenezb: -; CHECK: vfenezb %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vfenezb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VFENEZH. -define <8 x i16> @test_vfenezh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vfenezh: -; CHECK: vfenezh %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vfenezh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VFENEZF. -define <4 x i32> @test_vfenezf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vfenezf: -; CHECK: vfenezf %v24, %v24, %v26 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vfenezf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VFENEZBS. -define <16 x i8> @test_vfenezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezbs: -; CHECK: vfenezbs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8> %a, <16 x i8> %b) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VFENEZHS. 
-define <8 x i16> @test_vfenezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezhs: -; CHECK: vfenezhs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16> %a, <8 x i16> %b) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VFENEZFS. -define <4 x i32> @test_vfenezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { -; CHECK-LABEL: test_vfenezfs: -; CHECK: vfenezfs %v24, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32> %a, <4 x i32> %b) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VISTRB. -define <16 x i8> @test_vistrb(<16 x i8> %a) { -; CHECK-LABEL: test_vistrb: -; CHECK: vistrb %v24, %v24 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vistrb(<16 x i8> %a) - ret <16 x i8> %res -} - -; VISTRH. -define <8 x i16> @test_vistrh(<8 x i16> %a) { -; CHECK-LABEL: test_vistrh: -; CHECK: vistrh %v24, %v24 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vistrh(<8 x i16> %a) - ret <8 x i16> %res -} - -; VISTRF. -define <4 x i32> @test_vistrf(<4 x i32> %a) { -; CHECK-LABEL: test_vistrf: -; CHECK: vistrf %v24, %v24 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vistrf(<4 x i32> %a) - ret <4 x i32> %res -} - -; VISTRBS. 
-define <16 x i8> @test_vistrbs(<16 x i8> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrbs: -; CHECK: vistrbs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8> %a) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VISTRHS. -define <8 x i16> @test_vistrhs(<8 x i16> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrhs: -; CHECK: vistrhs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16> %a) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VISTRFS. -define <4 x i32> @test_vistrfs(<4 x i32> %a, i32 *%ccptr) { -; CHECK-LABEL: test_vistrfs: -; CHECK: vistrfs %v24, %v24 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32> %a) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VSTRCB with !IN !RT. -define <16 x i8> @test_vstrcb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_0: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - ret <16 x i8> %res -} - -; VSTRCB with !IN RT. -define <16 x i8> @test_vstrcb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_4: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 4) - ret <16 x i8> %res -} - -; VSTRCB with IN !RT. 
-define <16 x i8> @test_vstrcb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_8: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 8) - ret <16 x i8> %res -} - -; VSTRCB with IN RT. -define <16 x i8> @test_vstrcb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_12: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 12) - ret <16 x i8> %res -} - -; VSTRCB with CS -- should be ignored. -define <16 x i8> @test_vstrcb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrcb_1: -; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VSTRCH. -define <8 x i16> @test_vstrch(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vstrch: -; CHECK: vstrch %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vstrch(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - ret <8 x i16> %res -} - -; VSTRCF. -define <4 x i32> @test_vstrcf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vstrcf: -; CHECK: vstrcf %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vstrcf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - ret <4 x i32> %res -} - -; VSTRCBS. 
-define <16 x i8> @test_vstrcbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrcbs: -; CHECK: vstrcbs %v24, %v24, %v26, %v28, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VSTRCHS. -define <8 x i16> @test_vstrchs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrchs: -; CHECK: vstrchs %v24, %v24, %v26, %v28, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VSTRCFS. -define <4 x i32> @test_vstrcfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrcfs: -; CHECK: vstrcfs %v24, %v24, %v26, %v28, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VSTRCZB with !IN !RT. -define <16 x i8> @test_vstrczb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_0: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - ret <16 x i8> %res -} - -; VSTRCZB with !IN RT. 
-define <16 x i8> @test_vstrczb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_4: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 4) - ret <16 x i8> %res -} - -; VSTRCZB with IN !RT. -define <16 x i8> @test_vstrczb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_8: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 8) - ret <16 x i8> %res -} - -; VSTRCZB with IN RT. -define <16 x i8> @test_vstrczb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_12: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 12 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 12) - ret <16 x i8> %res -} - -; VSTRCZB with CS -- should be ignored. -define <16 x i8> @test_vstrczb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK-LABEL: test_vstrczb_1: -; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 -; CHECK: br %r14 - %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 1) - ret <16 x i8> %res -} - -; VSTRCZH. -define <8 x i16> @test_vstrczh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK-LABEL: test_vstrczh: -; CHECK: vstrczh %v24, %v24, %v26, %v28, 4 -; CHECK: br %r14 - %res = call <8 x i16> @llvm.s390.vstrczh(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - ret <8 x i16> %res -} - -; VSTRCZF. -define <4 x i32> @test_vstrczf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK-LABEL: test_vstrczf: -; CHECK: vstrczf %v24, %v24, %v26, %v28, 8 -; CHECK: br %r14 - %res = call <4 x i32> @llvm.s390.vstrczf(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - ret <4 x i32> %res -} - -; VSTRCZBS. 
-define <16 x i8> @test_vstrczbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczbs: -; CHECK: vstrczbs %v24, %v24, %v26, %v28, 0 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c, i32 0) - %res = extractvalue {<16 x i8>, i32} %call, 0 - %cc = extractvalue {<16 x i8>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <16 x i8> %res -} - -; VSTRCZHS. -define <8 x i16> @test_vstrczhs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczhs: -; CHECK: vstrczhs %v24, %v24, %v26, %v28, 4 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16> %a, <8 x i16> %b, - <8 x i16> %c, i32 4) - %res = extractvalue {<8 x i16>, i32} %call, 0 - %cc = extractvalue {<8 x i16>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <8 x i16> %res -} - -; VSTRCZFS. -define <4 x i32> @test_vstrczfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, - i32 *%ccptr) { -; CHECK-LABEL: test_vstrczfs: -; CHECK: vstrczfs %v24, %v24, %v26, %v28, 8 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: srl [[REG]], 28 -; CHECK: st [[REG]], 0(%r2) -; CHECK: br %r14 - %call = call {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32> %a, <4 x i32> %b, - <4 x i32> %c, i32 8) - %res = extractvalue {<4 x i32>, i32} %call, 0 - %cc = extractvalue {<4 x i32>, i32} %call, 1 - store i32 %cc, i32 *%ccptr - ret <4 x i32> %res -} - -; VFCEDBS with no processing of the result. 
-define i32 @test_vfcedbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfcedbs: -; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCEDBS, returning 1 if any elements are equal (CC != 3). -define i32 @test_vfcedbs_any_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfcedbs_any_bool: -; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: afi %r2, -536870912 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ne i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCEDBS, storing to %ptr if any elements are equal. -define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfcedbs_any_store: -; CHECK-NOT: %r -; CHECK: vfcedbs %v24, %v24, %v26 -; CHECK-NEXT: {{bor|bnler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ule i32 %cc, 2 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFCHDBS with no processing of the result. -define i32 @test_vfchdbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchdbs: -; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCHDBS, returning 1 if not all elements are higher. 
-define i32 @test_vfchdbs_notall_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchdbs_notall_bool: -; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 36 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp sge i32 %res, 1 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCHDBS, storing to %ptr if not all elements are higher. -define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfchdbs_notall_store: -; CHECK-NOT: %r -; CHECK: vfchdbs %v24, %v24, %v26 -; CHECK-NEXT: {{bher|ber}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp ugt i32 %cc, 0 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFCHEDBS with no processing of the result. -define i32 @test_vfchedbs(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchedbs: -; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFCHEDBS, returning 1 if neither element is higher or equal. 
-define i32 @test_vfchedbs_none_bool(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: test_vfchedbs_none_bool: -; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 -; CHECK: ipm [[REG:%r[0-5]]] -; CHECK: risblg %r2, [[REG]], 31, 159, 35 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp eq i32 %res, 3 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFCHEDBS, storing to %ptr if neither element is higher or equal. -define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b, - i32 *%ptr) { -; CHECK-LABEL: test_vfchedbs_none_store: -; CHECK-NOT: %r -; CHECK: vfchedbs %v24, %v24, %v26 -; CHECK-NEXT: {{bnor|bler}} %r14 -; CHECK: mvhi 0(%r2), 0 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, - <2 x double> %b) - %res = extractvalue {<2 x i64>, i32} %call, 0 - %cc = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp uge i32 %cc, 3 - br i1 %cmp, label %store, label %exit - -store: - store i32 0, i32 *%ptr - br label %exit - -exit: - ret <2 x i64> %res -} - -; VFTCIDB with the lowest useful class selector and no processing of the result. -define i32 @test_vftcidb(<2 x double> %a) { -; CHECK-LABEL: test_vftcidb: -; CHECK: vftcidb {{%v[0-9]+}}, %v24, 1 -; CHECK: ipm %r2 -; CHECK: srl %r2, 28 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 1) - %res = extractvalue {<2 x i64>, i32} %call, 1 - ret i32 %res -} - -; VFTCIDB with the highest useful class selector, returning 1 if all elements -; have the right class (CC == 0). 
-define i32 @test_vftcidb_all_bool(<2 x double> %a) { -; CHECK-LABEL: test_vftcidb_all_bool: -; CHECK: vftcidb {{%v[0-9]+}}, %v24, 4094 -; CHECK: afi %r2, -268435456 -; CHECK: srl %r2, 31 -; CHECK: br %r14 - %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 4094) - %res = extractvalue {<2 x i64>, i32} %call, 1 - %cmp = icmp eq i32 %res, 0 - %ext = zext i1 %cmp to i32 - ret i32 %ext -} - -; VFIDB with a rounding mode not usable via standard intrinsics. -define <2 x double> @test_vfidb_0_4(<2 x double> %a) { -; CHECK-LABEL: test_vfidb_0_4: -; CHECK: vfidb %v24, %v24, 0, 4 -; CHECK: br %r14 - %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 0, i32 4) - ret <2 x double> %res -} - -; VFIDB with IEEE-inexact exception suppressed. -define <2 x double> @test_vfidb_4_0(<2 x double> %a) { -; CHECK-LABEL: test_vfidb_4_0: -; CHECK: vfidb %v24, %v24, 4, 0 -; CHECK: br %r14 - %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 4, i32 0) - ret <2 x double> %res -} - diff --git a/test/CodeGen/SystemZ/vec-max-05.ll b/test/CodeGen/SystemZ/vec-max-05.ll new file mode 100644 index 000000000000..591d3bf36f16 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-max-05.ll @@ -0,0 +1,175 @@ +; Test vector maximum on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @fmax(double, double) +declare double @llvm.maxnum.f64(double, double) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) + +declare float @fmaxf(float, float) +declare float @llvm.maxnum.f32(float, float) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) + +declare fp128 @fmaxl(fp128, fp128) +declare fp128 @llvm.maxnum.f128(fp128, fp128) + +; Test the fmax library function. 
+define double @f1(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f1: +; CHECK: wfmaxdb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @fmax(double %val1, double %val2) readnone + ret double %ret +} + +; Test the f64 maxnum intrinsic. +define double @f2(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmaxdb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @llvm.maxnum.f64(double %val1, double %val2) + ret double %ret +} + +; Test a f64 constant compare/select resulting in maxnum. +define double @f3(double %dummy, double %val) { +; CHECK-LABEL: f3: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmaxdb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp ogt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test a f64 constant compare/select resulting in maxnan. +define double @f4(double %dummy, double %val) { +; CHECK-LABEL: f4: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmaxdb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ugt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test the v2f64 maxnum intrinsic. +define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfmaxdb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %val1, <2 x double> %val2) + ret <2 x double> %ret +} + +; Test the fmaxf library function. +define float @f11(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f11: +; CHECK: wfmaxsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @fmaxf(float %val1, float %val2) readnone + ret float %ret +} + +; Test the f32 maxnum intrinsic. 
+define float @f12(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f12: +; CHECK: wfmaxsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @llvm.maxnum.f32(float %val1, float %val2) + ret float %ret +} + +; Test a f32 constant compare/select resulting in maxnum. +define float @f13(float %dummy, float %val) { +; CHECK-LABEL: f13: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfmaxsb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp ogt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test a f32 constant compare/select resulting in maxnan. +define float @f14(float %dummy, float %val) { +; CHECK-LABEL: f14: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfmaxsb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ugt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test the v4f32 maxnum intrinsic. +define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f15: +; CHECK: vfmaxsb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %val1, <4 x float> %val2) + ret <4 x float> %ret +} + +; Test the fmaxl library function. +define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f21: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @fmaxl(fp128 %val1, fp128 %val2) readnone + store fp128 %res, fp128* %dst + ret void +} + +; Test the f128 maxnum intrinsic. 
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f22: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @llvm.maxnum.f128(fp128 %val1, fp128 %val2) + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in maxnum. +define void @f23(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f23: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ogt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in maxnan. +define void @f24(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f24: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ugt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + diff --git a/test/CodeGen/SystemZ/vec-min-05.ll b/test/CodeGen/SystemZ/vec-min-05.ll new file mode 100644 index 000000000000..3eef9016cd08 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-min-05.ll @@ -0,0 +1,175 @@ +; Test vector minimum on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @fmin(double, double) +declare double @llvm.minnum.f64(double, double) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) + +declare float @fminf(float, float) +declare float @llvm.minnum.f32(float, float) +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) + +declare fp128 @fminl(fp128, fp128) +declare fp128 @llvm.minnum.f128(fp128, fp128) + +; Test the fmin library function. +define double @f1(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f1: +; CHECK: wfmindb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @fmin(double %val1, double %val2) readnone + ret double %ret +} + +; Test the f64 minnum intrinsic. +define double @f2(double %dummy, double %val1, double %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmindb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call double @llvm.minnum.f64(double %val1, double %val2) + ret double %ret +} + +; Test a f64 constant compare/select resulting in minnum. +define double @f3(double %dummy, double %val) { +; CHECK-LABEL: f3: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmindb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp olt double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test a f64 constant compare/select resulting in minnan. +define double @f4(double %dummy, double %val) { +; CHECK-LABEL: f4: +; CHECK: lzdr [[REG:%f[0-9]+]] +; CHECK: wfmindb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ult double %val, 0.0 + %ret = select i1 %cmp, double %val, double 0.0 + ret double %ret +} + +; Test the v2f64 minnum intrinsic. +define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2) { +; CHECK-LABEL: f5: +; CHECK: vfmindb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.minnum.v2f64(<2 x double> %val1, <2 x double> %val2) + ret <2 x double> %ret +} + +; Test the fminf library function. 
+define float @f11(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f11: +; CHECK: wfminsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @fminf(float %val1, float %val2) readnone + ret float %ret +} + +; Test the f32 minnum intrinsic. +define float @f12(float %dummy, float %val1, float %val2) { +; CHECK-LABEL: f12: +; CHECK: wfminsb %f0, %f2, %f4, 4 +; CHECK: br %r14 + %ret = call float @llvm.minnum.f32(float %val1, float %val2) + ret float %ret +} + +; Test a f32 constant compare/select resulting in minnum. +define float @f13(float %dummy, float %val) { +; CHECK-LABEL: f13: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfminsb %f0, %f2, [[REG]], 4 +; CHECK: br %r14 + %cmp = fcmp olt float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test a f32 constant compare/select resulting in minnan. +define float @f14(float %dummy, float %val) { +; CHECK-LABEL: f14: +; CHECK: lzer [[REG:%f[0-9]+]] +; CHECK: wfminsb %f0, %f2, [[REG]], 1 +; CHECK: br %r14 + %cmp = fcmp ult float %val, 0.0 + %ret = select i1 %cmp, float %val, float 0.0 + ret float %ret +} + +; Test the v4f32 minnum intrinsic. +define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f15: +; CHECK: vfminsb %v24, %v26, %v28, 4 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.minnum.v4f32(<4 x float> %val1, <4 x float> %val2) + ret <4 x float> %ret +} + +; Test the fminl library function. +define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f21: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @fminl(fp128 %val1, fp128 %val2) readnone + store fp128 %res, fp128* %dst + ret void +} + +; Test the f128 minnum intrinsic. 
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +; CHECK-LABEL: f22: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r4) +; CHECK: br %r14 + %val1 = load fp128, fp128* %ptr1 + %val2 = load fp128, fp128* %ptr2 + %res = call fp128 @llvm.minnum.f128(fp128 %val1, fp128 %val2) + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in minnum. +define void @f23(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f23: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp olt fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + +; Test a f128 constant compare/select resulting in minnan. +define void @f24(fp128 *%ptr, fp128 *%dst) { +; CHECK-LABEL: f24: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1 +; CHECK: vst [[RES]], 0(%r3) +; CHECK: br %r14 + %val = load fp128, fp128* %ptr + %cmp = fcmp ult fp128 %val, 0xL00000000000000000000000000000000 + %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000 + store fp128 %res, fp128* %dst + ret void +} + diff --git a/test/CodeGen/SystemZ/vec-move-18.ll b/test/CodeGen/SystemZ/vec-move-18.ll new file mode 100644 index 000000000000..5d3d09d83ef1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-move-18.ll @@ -0,0 +1,24 @@ +; Test insertions of memory values into 0 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test VLLEZLF. 
+define <4 x i32> @f1(i32 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vllezlf %v24, 0(%r2) +; CHECK: br %r14 + %val = load i32, i32 *%ptr + %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0 + ret <4 x i32> %ret +} + +; Test VLLEZLF with a float. +define <4 x float> @f2(float *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vllezlf %v24, 0(%r2) +; CHECK: br %r14 + %val = load float, float *%ptr + %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0 + ret <4 x float> %ret +} + diff --git a/test/CodeGen/SystemZ/vec-mul-03.ll b/test/CodeGen/SystemZ/vec-mul-03.ll new file mode 100644 index 000000000000..3733db9fb339 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-03.ll @@ -0,0 +1,24 @@ +; Test vector multiplication on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 multiplication. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f1: +; CHECK: vfmsb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fmul <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 multiplication that uses vector registers. +define float @f2(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f2: +; CHECK: wfmsb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fmul float %scalar1, %scalar2 + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-04.ll b/test/CodeGen/SystemZ/vec-mul-04.ll new file mode 100644 index 000000000000..d96f0b6a745a --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-04.ll @@ -0,0 +1,31 @@ +; Test vector multiply-and-add on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Test a v4f32 multiply-and-add. 
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f1: +; CHECK: vfmasb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %val3) + ret <4 x float> %ret +} + +; Test a v4f32 multiply-and-subtract. +define <4 x float> @f2(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f2: +; CHECK: vfmssb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <4 x float> , %val3 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %negval3) + ret <4 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-mul-05.ll b/test/CodeGen/SystemZ/vec-mul-05.ll new file mode 100644 index 000000000000..90a1f7a7efdf --- /dev/null +++ b/test/CodeGen/SystemZ/vec-mul-05.ll @@ -0,0 +1,63 @@ +; Test vector negative multiply-and-add on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Test a v2f64 negative multiply-and-add. +define <2 x double> @f1(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f1: +; CHECK: vfnmadb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1, + <2 x double> %val2, + <2 x double> %val3) + %negret = fsub <2 x double> , %ret + ret <2 x double> %negret +} + +; Test a v2f64 negative multiply-and-subtract. 
+define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, + <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f2: +; CHECK: vfnmsdb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <2 x double> , %val3 + %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1, + <2 x double> %val2, + <2 x double> %negval3) + %negret = fsub <2 x double> , %ret + ret <2 x double> %negret +} + +; Test a v4f32 negative multiply-and-add. +define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f3: +; CHECK: vfnmasb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %val3) + %negret = fsub <4 x float> , %ret + ret <4 x float> %negret +} + +; Test a v4f32 negative multiply-and-subtract. +define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f4: +; CHECK: vfnmssb %v24, %v26, %v28, %v30 +; CHECK: br %r14 + %negval3 = fsub <4 x float> , %val3 + %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1, + <4 x float> %val2, + <4 x float> %negval3) + %negret = fsub <4 x float> , %ret + ret <4 x float> %negret +} diff --git a/test/CodeGen/SystemZ/vec-neg-02.ll b/test/CodeGen/SystemZ/vec-neg-02.ll new file mode 100644 index 000000000000..07ce037542fd --- /dev/null +++ b/test/CodeGen/SystemZ/vec-neg-02.ll @@ -0,0 +1,23 @@ +; Test vector negation on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 negation. +define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vflcsb %v24, %v26 +; CHECK: br %r14 + %ret = fsub <4 x float> , %val + ret <4 x float> %ret +} + +; Test an f32 negation that uses vector registers. 
+define float @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: wflcsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = fsub float -0.0, %scalar + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-or-03.ll b/test/CodeGen/SystemZ/vec-or-03.ll new file mode 100644 index 000000000000..010629d880d1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-or-03.ll @@ -0,0 +1,91 @@ +; Test vector OR-NOT on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 OR-NOT. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <16 x i8> %val2, + %ret = or <16 x i8> %val1, %not + ret <16 x i8> %ret +} + +; ...and again with the reverse. +define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f2: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <16 x i8> %val1, + %ret = or <16 x i8> %not, %val2 + ret <16 x i8> %ret +} + +; Test a v8i16 OR-NOT. +define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f3: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <8 x i16> %val2, + %ret = or <8 x i16> %val1, %not + ret <8 x i16> %ret +} + +; ...and again with the reverse. +define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f4: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <8 x i16> %val1, + %ret = or <8 x i16> %not, %val2 + ret <8 x i16> %ret +} + +; Test a v4i32 OR-NOT. +define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f5: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <4 x i32> %val2, + %ret = or <4 x i32> %val1, %not + ret <4 x i32> %ret +} + +; ...and again with the reverse. 
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f6: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <4 x i32> %val1, + %ret = or <4 x i32> %not, %val2 + ret <4 x i32> %ret +} + +; Test a v2i64 OR-NOT. +define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f7: +; CHECK: voc %v24, %v26, %v28 +; CHECK: br %r14 + %not = xor <2 x i64> %val2, + %ret = or <2 x i64> %val1, %not + ret <2 x i64> %ret +} + +; ...and again with the reverse. +define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f8: +; CHECK: voc %v24, %v28, %v26 +; CHECK: br %r14 + %not = xor <2 x i64> %val1, + %ret = or <2 x i64> %not, %val2 + ret <2 x i64> %ret +} diff --git a/test/CodeGen/SystemZ/vec-round-02.ll b/test/CodeGen/SystemZ/vec-round-02.ll new file mode 100644 index 000000000000..bcd66ea803d1 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-round-02.ll @@ -0,0 +1,118 @@ +; Test v4f32 rounding on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.rint.f32(float) +declare float @llvm.nearbyint.f32(float) +declare float @llvm.floor.f32(float) +declare float @llvm.ceil.f32(float) +declare float @llvm.trunc.f32(float) +declare float @llvm.round.f32(float) +declare <4 x float> @llvm.rint.v4f32(<4 x float>) +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +declare <4 x float> @llvm.round.v4f32(<4 x float>) + +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vfisb %v24, %v24, 0, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: vfisb %v24, %v24, 4, 0 +; CHECK: br %r14 + %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f3(<4 x float> %val) { +; CHECK-LABEL: f3: +; CHECK: vfisb %v24, %v24, 4, 7 +; CHECK: br %r14 + %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f4(<4 x float> %val) { +; CHECK-LABEL: f4: +; CHECK: vfisb %v24, %v24, 4, 6 +; CHECK: br %r14 + %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f5(<4 x float> %val) { +; CHECK-LABEL: f5: +; CHECK: vfisb %v24, %v24, 4, 5 +; CHECK: br %r14 + %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define <4 x float> @f6(<4 x float> %val) { +; CHECK-LABEL: f6: +; CHECK: vfisb %v24, %v24, 4, 1 +; CHECK: br %r14 + %res = call <4 x float> @llvm.round.v4f32(<4 x float> %val) + ret <4 x float> %res +} + +define float @f7(<4 x float> %val) { +; CHECK-LABEL: f7: +; CHECK: wfisb %f0, %v24, 0, 0 +; CHECK: br %r14 + %scalar = extractelement <4 x float> 
%val, i32 0 + %res = call float @llvm.rint.f32(float %scalar) + ret float %res +} + +define float @f8(<4 x float> %val) { +; CHECK-LABEL: f8: +; CHECK: wfisb %f0, %v24, 4, 0 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.nearbyint.f32(float %scalar) + ret float %res +} + +define float @f9(<4 x float> %val) { +; CHECK-LABEL: f9: +; CHECK: wfisb %f0, %v24, 4, 7 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.floor.f32(float %scalar) + ret float %res +} + +define float @f10(<4 x float> %val) { +; CHECK-LABEL: f10: +; CHECK: wfisb %f0, %v24, 4, 6 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.ceil.f32(float %scalar) + ret float %res +} + +define float @f11(<4 x float> %val) { +; CHECK-LABEL: f11: +; CHECK: wfisb %f0, %v24, 4, 5 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.trunc.f32(float %scalar) + ret float %res +} + +define float @f12(<4 x float> %val) { +; CHECK-LABEL: f12: +; CHECK: wfisb %f0, %v24, 4, 1 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %res = call float @llvm.round.f32(float %scalar) + ret float %res +} diff --git a/test/CodeGen/SystemZ/vec-sqrt-02.ll b/test/CodeGen/SystemZ/vec-sqrt-02.ll new file mode 100644 index 000000000000..6970d9db6698 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-sqrt-02.ll @@ -0,0 +1,23 @@ +; Test f32 and v4f32 square root on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.sqrt.f32(float) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vfsqsb %v24, %v24 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %val) + ret <4 x float> %ret +} + +define float @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: wfsqsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = call float @llvm.sqrt.f32(float %scalar) + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-sub-02.ll b/test/CodeGen/SystemZ/vec-sub-02.ll new file mode 100644 index 000000000000..83c76b5d4aa6 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-sub-02.ll @@ -0,0 +1,31 @@ +; Test vector subtraction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 subtraction. +define <4 x float> @f6(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK: vfssb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fsub <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 subtraction that uses vector registers. +define float @f7(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK: wfssb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fsub float %scalar1, %scalar2 + ret float %ret +} + +; Test a v2f32 subtraction, which gets promoted to v4f32. +define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) { +; No particular output expected, but must compile. + %ret = fsub <2 x float> %val1, %val2 + ret <2 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-xor-02.ll b/test/CodeGen/SystemZ/vec-xor-02.ll new file mode 100644 index 000000000000..b4b5a96ba254 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-xor-02.ll @@ -0,0 +1,47 @@ +; Test vector NOT-XOR on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 NOT-XOR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <16 x i8> %val1, %val2 + %not = xor <16 x i8> %ret, + ret <16 x i8> %not +} + +; Test a v8i16 NOT-XOR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <8 x i16> %val1, %val2 + %not = xor <8 x i16> %ret, + ret <8 x i16> %not +} + +; Test a v4i32 NOT-XOR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <4 x i32> %val1, %val2 + %not = xor <4 x i32> %ret, + ret <4 x i32> %not +} + +; Test a v2i64 NOT-XOR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <2 x i64> %val1, %val2 + %not = xor <2 x i64> %ret, + ret <2 x i64> %not +} diff --git a/test/CodeGen/Thumb/litpoolremat.ll b/test/CodeGen/Thumb/litpoolremat.ll new file mode 100644 index 000000000000..6ed9b0c2a7ce --- /dev/null +++ b/test/CodeGen/Thumb/litpoolremat.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s + +declare void @consume_value(i32) #1 + +declare i32 @get_value(...) #1 + +declare void @consume_three_values(i32, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define void @should_not_spill() #0 { + tail call void @consume_value(i32 1764) #2 + %1 = tail call i32 (...) @get_value() #2 + %2 = tail call i32 (...) @get_value() #2 + %3 = tail call i32 (...) 
@get_value() #2 + tail call void @consume_value(i32 %1) #2 + tail call void @consume_value(i32 %2) #2 + tail call void @consume_value(i32 %3) #2 + tail call void @consume_value(i32 1764) #2 + tail call void @consume_three_values(i32 %1, i32 %2, i32 %3) #2 + ret void +} + +; CHECK: ldr r0, LCPI0_0 +; CHECK-NOT: str r0 +; CHECK: bl +; CHECK: ldr r0, LCPI0_0 +; CHECK-LABEL: LCPI0_0: +; CHECK-NEXT: .long 1764 diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll index fe69a39e350c..75dbeab5ad0f 100644 --- a/test/CodeGen/Thumb/select.ll +++ b/test/CodeGen/Thumb/select.ll @@ -74,9 +74,9 @@ define double @f7(double %a, double %b) { } ; CHECK-LABEL: f7: ; CHECK: blt -; CHECK: blt +; CHECK: {{blt|bge}} ; CHECK: __ltdf2 ; CHECK-EABI-LABEL: f7: ; CHECK-EABI: __aeabi_dcmplt ; CHECK-EABI: bne -; CHECK-EABI: bne +; CHECK-EABI: {{bne|beq}} diff --git a/test/CodeGen/WebAssembly/indirect-import.ll b/test/CodeGen/WebAssembly/indirect-import.ll index 1bde65bcbbba..7cac31a2aef5 100644 --- a/test/CodeGen/WebAssembly/indirect-import.ll +++ b/test/CodeGen/WebAssembly/indirect-import.ll @@ -19,9 +19,9 @@ entry: %vs = alloca void (%struct.big*)*, align 4 %s = alloca void (%struct.big*)*, align 4 -; CHECK: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_vj@FUNCTION store float (double)* @extern_fd, float (double)** %fd, align 4 -; CHECK: i32.const {{.+}}=, extern_vj@FUNCTION store void (i64)* @extern_vj, void (i64)** %vj, align 4 %0 = load void (i64)*, void (i64)** %vj, align 4 call void %0(i64 1) @@ -36,10 +36,9 @@ entry: %2 = load i32 (i64, i32, double, float)*, i32 (i64, i32, double, float)** %ijidf, align 4 %call = call i32 %2(i64 1, i32 2, double 3.000000e+00, float 4.000000e+00) -; CHECK: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_struct, void 
(%struct.big*)** %vs, align 4 - -; CHECK: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_sret, void (%struct.big*)** %s, align 4 %3 = load float (double)*, float (double)** %fd, align 4 %4 = ptrtoint float (double)* %3 to i32 diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll index c160b391f6e8..2580771eb2cf 100644 --- a/test/CodeGen/WebAssembly/userstack.ll +++ b/test/CodeGen/WebAssembly/userstack.ll @@ -36,13 +36,13 @@ define void @alloca3264() { ; CHECK-NEXT: tee_local $push[[L5:.+]]=, [[SP:.+]], $pop[[L6]] %r1 = alloca i32 %r2 = alloca double - ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 - ; CHECK-NEXT: i32.store 12($pop[[L5]]), $pop[[L0]] store i32 0, i32* %r1 - ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} - ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 - ; CHECK-NEXT: i64.store 0($pop[[L2]]), $pop[[L1]] store double 0.0, double* %r2 + ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 + ; CHECK-NEXT: i64.store 0($pop[[L5]]), $pop[[L1]] + ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} + ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 + ; CHECK-NEXT: i32.store 12($pop[[L2]]), $pop[[L0]] ; CHECK-NEXT: return ret void } diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll index 7da85d3a9a1d..fa71bffaf8c6 100644 --- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll +++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s ; ; Test scheduling a multi-use compare. We should neither spill flags ; nor clone the compare. 
diff --git a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll index 8d387136da9c..37f01845db79 100644 --- a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll +++ b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s ; CHECK-NOT: -{{[1-9][0-9]*}}(%rsp) -define x86_64_win64cc x86_fp80 @a(i64 %x) nounwind readnone { +define win64cc x86_fp80 @a(i64 %x) nounwind readnone { entry: %conv = sitofp i64 %x to x86_fp80 ; [#uses=1] ret x86_fp80 %conv diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll index ba5de8eb5fcb..e812cbe3270a 100644 --- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -83,10 +83,11 @@ define void @full_test() { ; X32-NEXT: cmpeqps %xmm2, %xmm1 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; X32-NEXT: extractps $1, %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll index 9dff4e596caa..72807922a22b 100644 --- a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll +++ b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck --check-prefix=CHECK %s +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0); diff --git a/test/CodeGen/X86/alias-static-alloca.ll b/test/CodeGen/X86/alias-static-alloca.ll new file mode 100644 index 
000000000000..f4ca7e39f4fc --- /dev/null +++ b/test/CodeGen/X86/alias-static-alloca.ll @@ -0,0 +1,37 @@ +; RUN: llc -o - -mtriple=x86_64-linux-gnu %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; We should be able to bypass the load values to their corresponding +; stores here. + +; CHECK-LABEL: foo +; CHECK-DAG: movl %esi, -8(%rsp) +; CHECK-DAG: movl %ecx, -16(%rsp) +; CHECK-DAG: movl %edi, -4(%rsp) +; CHECK-DAG: movl %edx, -12(%rsp) +; CHECK: leal +; CHECK: addl +; CHECK: addl +; CHECK: retq + +define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %a0 = alloca i32 + %a1 = alloca i32 + %a2 = alloca i32 + %a3 = alloca i32 + store i32 %b, i32* %a1 + store i32 %d, i32* %a3 + store i32 %a, i32* %a0 + store i32 %c, i32* %a2 + %l0 = load i32, i32* %a0 + %l1 = load i32, i32* %a1 + %l2 = load i32, i32* %a2 + %l3 = load i32, i32* %a3 + %add0 = add nsw i32 %l0, %l1 + %add1 = add nsw i32 %add0, %l2 + %add2 = add nsw i32 %add1, %l3 + ret i32 %add2 +} diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll index d5d3fa6db5e8..1a6fde371f09 100644 --- a/test/CodeGen/X86/atomic-minmax-i6432.ll +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -9,32 +9,32 @@ define void @atomic_maxmin_i6432() { ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %2 = atomicrmw min i64* @sc64, i64 6 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %3 = atomicrmw umax i64* @sc64, i64 7 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %4 = atomicrmw umin i64* @sc64, i64 8 acquire ; LINUX: 
[[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] ret void diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll index 77bbdec826a5..c6300708bcc1 100644 --- a/test/CodeGen/X86/atomic128.ll +++ b/test/CodeGen/X86/atomic128.ll @@ -167,14 +167,24 @@ define void @fetch_and_min(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB5_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB5_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB5_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB5_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB5_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -203,14 +213,24 @@ define void @fetch_and_max(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %r8, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB6_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB6_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB6_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; 
CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB6_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB6_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -239,14 +259,24 @@ define void @fetch_and_umin(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setae %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB7_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB7_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB7_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB7_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB7_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -275,14 +305,24 @@ define void @fetch_and_umax(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setb %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB8_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB8_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq 
%rdx, %rcx +; CHECK-NEXT: jne LBB8_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB8_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB8_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index a12a412fb94d..953f3bdd06e8 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -27,9 +27,9 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %1, %2 @@ -57,9 +57,9 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fadd <8 x float> %1, %2 @@ -87,9 +87,9 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub 
; ; ZNVER1-LABEL: test_addsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -118,9 +118,9 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_addsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -152,10 +152,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_andnotpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 
x i64> %3 = xor <4 x i64> %1, @@ -193,10 +193,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_andnotps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, @@ -234,10 +234,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_andpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -273,10 +273,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: 
[3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -313,9 +313,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %a1, %1 @@ -345,8 +345,8 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> * ; ZNVER1-LABEL: test_blendps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -374,9 +374,9 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_blendvpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] 
+; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) %2 = load <4 x double>, <4 x double> *%a3, align 32 %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) @@ -405,9 +405,9 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_blendvps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) %2 = load <8 x float>, <8 x float> *%a3, align 32 %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) @@ -433,8 +433,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) { ; ; ZNVER1-LABEL: test_broadcastf128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 32 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> ret <8 x float> %2 @@ -458,8 +458,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) { ; ; ZNVER1-LABEL: test_broadcastsd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double *%a0, 
align 8 %2 = insertelement <4 x double> undef, double %1, i32 0 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer @@ -484,8 +484,8 @@ define <4 x float> @test_broadcastss(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer @@ -510,8 +510,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <8 x float> undef, float %1, i32 0 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer @@ -543,9 +543,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_cmppd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fcmp oeq <4 x double> %a0, %2 @@ -581,9 +581,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_cmpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # 
sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fcmp oeq <8 x float> %a0, %2 @@ -618,10 +618,10 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x double> @@ -655,10 +655,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 %3 = sitofp <8 x i32> %2 to <8 x float> @@ -690,10 +690,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2dq: ; ZNVER1: # 
BB#0: -; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x double> %a0 to <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptosi <4 x double> %2 to <4 x i32> @@ -725,10 +725,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc <4 x double> %a0 to <4 x float> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptrunc <4 x double> %2 to <4 x float> @@ -760,10 +760,10 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_cvtps2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <8 x float> %a0 to <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = fptosi <8 x float> %2 to <8 x i32> @@ -792,9 +792,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> 
; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fdiv <4 x double> %1, %2 @@ -822,9 +822,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fdiv <8 x float> %1, %2 @@ -853,8 +853,8 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ZNVER1-LABEL: test_dpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) @@ -886,9 +886,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa ; ZNVER1-LABEL: test_extractf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] 
+; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> store <4 x float> %2, <4 x float> *%a2 @@ -916,9 +916,9 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) @@ -947,9 +947,9 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) @@ -978,9 +978,9 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubpd (%rdi), 
%ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -1009,9 +1009,9 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -1044,9 +1044,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] -; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> %3 = load <4 x float>, <4 x float> *%a2, align 16 @@ -1074,8 +1074,8 @@ define <32 x i8> @test_lddqu(i8* %a0) { ; ; ZNVER1-LABEL: test_lddqu: ; ZNVER1: # BB#0: 
-; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ret <32 x i8> %1 } @@ -1108,7 +1108,7 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) ret <2 x double> %1 @@ -1143,7 +1143,7 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2 ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) ret <4 x double> %1 @@ -1178,7 +1178,7 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) ret <4 x float> %1 @@ -1213,7 +1213,7 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x 
i32> %a1, <8 x float> %a2) ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ret <8 x float> %1 @@ -1243,8 +1243,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_maxpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) @@ -1274,8 +1274,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_maxps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) @@ -1305,8 +1305,8 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_minpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: 
[8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) @@ -1336,8 +1336,8 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_minps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) @@ -1369,10 +1369,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 32 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 32 @@ -1403,10 +1403,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: 
[1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 32 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 32 @@ -1437,10 +1437,10 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1468,9 +1468,9 @@ define i32 @test_movmskpd(<4 x double> %a0) { ; ; ZNVER1-LABEL: test_movmskpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ret i32 %1 } @@ -1496,9 +1496,9 @@ define i32 @test_movmskps(<8 x float> %a0) { ; ; ZNVER1-LABEL: test_movmskps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ret i32 %1 } @@ 
-1525,9 +1525,9 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a0 store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 ret <4 x double> %1 @@ -1554,9 +1554,9 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a0 store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 ret <8 x float> %1 @@ -1586,10 +1586,10 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1621,10 +1621,10 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> 
*%a1) { ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1658,10 +1658,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 1 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 1 @@ -1694,10 +1694,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 1 %2 = 
fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 1 @@ -1725,9 +1725,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fmul <4 x double> %1, %2 @@ -1755,9 +1755,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_mulps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fmul <8 x float> %1, %2 @@ -1788,10 +1788,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) ; ; ZNVER1-LABEL: orpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1827,10 +1827,10 @@ define <8 x float> 
@test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ; ZNVER1-LABEL: test_orps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1866,10 +1866,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> @@ -1901,10 +1901,10 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x 
double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1936,10 +1936,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -1971,10 +1971,10 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -2004,8 +2004,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> ; ZNVER1-LABEL: test_permilvarpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> 
@llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) @@ -2035,8 +2035,8 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x ; ZNVER1-LABEL: test_permilvarpd_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) @@ -2066,8 +2066,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> * ; ZNVER1-LABEL: test_permilvarps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) @@ -2097,8 +2097,8 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3 ; ZNVER1-LABEL: test_permilvarps_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) %2 = load 
<8 x i32>, <8 x i32> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) @@ -2130,10 +2130,10 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rcpps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) @@ -2166,10 +2166,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_roundpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) @@ -2202,10 +2202,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_roundps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # 
sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) @@ -2238,10 +2238,10 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rsqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) @@ -2275,9 +2275,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ZNVER1-LABEL: test_shufpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2307,8 +2307,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ZNVER1-LABEL: test_shufps: ; ZNVER1: # 
BB#0: ; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] -; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2339,10 +2339,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_sqrtpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00] -; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) @@ -2375,10 +2375,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_sqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00] -; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x 
float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) @@ -2408,9 +2408,9 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_subpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fsub <4 x double> %1, %2 @@ -2438,9 +2438,9 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_subps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fsub <8 x float> %1, %2 @@ -2477,12 +2477,12 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; ; ZNVER1-LABEL: test_testpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) @@ -2523,13 +2523,13 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a ; ; ZNVER1-LABEL: test_testpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) @@ -2568,12 +2568,12 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; ; ZNVER1-LABEL: test_testps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) %2 = load 
<4 x float>, <4 x float> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) @@ -2614,13 +2614,13 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) ; ; ZNVER1-LABEL: test_testps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) @@ -2654,9 +2654,9 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpckhpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2686,8 +2686,8 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x 
float> ; ZNVER1-LABEL: test_unpckhps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2719,9 +2719,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2751,8 +2751,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpcklps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] 
sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2783,10 +2783,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_xorpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2822,10 +2822,10 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_xorps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2856,7 +2856,7 @@ define void @test_zeroall() { ; ZNVER1-LABEL: test_zeroall: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroall() ret void } @@ -2881,7 +2881,7 @@ define void @test_zeroupper() { ; ZNVER1-LABEL: 
test_zeroupper: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroupper() ret void } diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index 017f54b40b2d..9918d6680256 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ -386,13 +386,13 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { define <4 x i32> @mul_const10(<4 x i32> %x) { ; X32-LABEL: mul_const10: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const10: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, @@ -403,13 +403,13 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { define <4 x i32> @mul_const11(<4 x i32> %x) { ; X32-LABEL: mul_const11: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const11: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 042bc217b97c..a3862d7e27c6 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -13,10 +13,10 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) { ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: 
vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) %2 = load <32 x i8>, <32 x i8> *%a1, align 32 %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2) @@ -35,10 +35,10 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) %2 = load <8 x i32>, <8 x i32> *%a1, align 32 %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2) @@ -57,10 +57,10 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) { ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) %2 = load <16 x i16>, <16 x i16> *%a1, align 32 %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2) @@ -78,9 +78,9 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x 
i8> *%a2) { ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = add <32 x i8> %1, %2 @@ -96,9 +96,9 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = add <8 x i32> %1, %2 @@ -114,9 +114,9 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = add <4 x i64> %1, %2 @@ -132,9 +132,9 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; 
ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = add <16 x i16> %1, %2 @@ -151,10 +151,10 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = and <4 x i64> %1, %2 @@ -172,10 +172,10 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %2 = and <4 x i64> %a1, %1 %3 = load <4 x i64>, <4 x i64> *%a2, align 32 @@ -194,9 +194,9 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # 
sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = mul <8 x i32> %1, %2 @@ -212,9 +212,9 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = mul <16 x i16> %1, %2 @@ -231,10 +231,10 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = or <4 x i64> %1, %2 @@ -251,9 +251,9 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = sub <32 x i8> %1, %2 @@ -269,9 
+269,9 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = sub <8 x i32> %1, %2 @@ -287,9 +287,9 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = sub <4 x i64> %1, %2 @@ -305,9 +305,9 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = sub <16 x i16> %1, %2 @@ -324,10 +324,10 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq 
%ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = xor <4 x i64> %1, %2 diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll index 127726ea30da..c77714b9e181 100644 --- a/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/test/CodeGen/X86/avx2-vector-shifts.ll @@ -376,7 +376,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X32: # BB#0: ; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm2 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper @@ -386,7 +386,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64: # BB#0: ; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 140299f5495d..e10a781fabc2 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1507,7 +1507,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { ; NOVL: # BB#0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: retq ; diff --git 
a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index e1a92c60d182..6f4bf061a215 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1630,8 +1630,9 @@ define void @f1(i32 %c) { ; CHECK-LABEL: f1: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movzbl {{.*}}(%rip), %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb {{.*}}(%rip), %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{.*}}(%rip) ; CHECK-NEXT: xorl $1, %edi ; CHECK-NEXT: jmp _f2 ## TAILCALL diff --git a/test/CodeGen/X86/avx512-rotate.ll b/test/CodeGen/X86/avx512-rotate.ll new file mode 100644 index 000000000000..98fa67ad793d --- /dev/null +++ b/test/CodeGen/X86/avx512-rotate.ll @@ -0,0 +1,256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; Tests showing replacement of variable rotates with immediate splat versions. 
+ +define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> 
@llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $5, 
%zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions. + +define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: 
test_splat_bounds_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> 
zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Constant folding + +define <8 x i64> @test_fold_rol_v8i64() { +; CHECK-LABEL: test_fold_rol_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_fold_ror_v8i64() { +; CHECK-LABEL: test_fold_ror_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = 
[1,9223372036854775808,4611686018427387904,2,9223372036854775808,4,2,2] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll index 10883a5a9a62..ce2b010ec0f2 100644 --- a/test/CodeGen/X86/avx512-shift.ll +++ b/test/CodeGen/X86/avx512-shift.ll @@ -1,136 +1,178 @@ -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX -;CHECK-LABEL: shift_16_i32 -;CHECK: vpsrld -;CHECK: vpslld -;CHECK: vpsrad -;CHECK: ret define <16 x i32> @shift_16_i32(<16 x i32> %a) { +; CHECK-LABEL: shift_16_i32: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld $1, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <16 x i32> %a, %c = shl <16 x i32> %b, %d = ashr <16 x i32> %c, ret <16 x i32> %d; } -;CHECK-LABEL: shift_8_i64 -;CHECK: vpsrlq -;CHECK: vpsllq -;CHECK: vpsraq -;CHECK: ret define <8 x i64> @shift_8_i64(<8 x i64> %a) { +; CHECK-LABEL: shift_8_i64: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0 +; CHECK-NEXT: vpsllq $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsraq $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <8 x i64> %a, %c = shl <8 x i64> %b, %d = ashr <8 x i64> %c, ret <8 x i64> %d; } -;SKX-LABEL: shift_4_i64 -;SKX: vpsrlq -;SKX: vpsllq -;SKX: vpsraq -;SKX: ret define <4 x i64> @shift_4_i64(<4 x i64> %a) { +; KNL-LABEL: shift_4_i64: +; KNL: # BB#0: +; KNL-NEXT: vpsrlq $1, %ymm0, %ymm0 +; KNL-NEXT: vpsllq $12, %ymm0, %ymm0 +; 
KNL-NEXT: vpsraq $12, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: shift_4_i64: +; SKX: # BB#0: +; SKX-NEXT: vpsrlq $1, %ymm0, %ymm0 +; SKX-NEXT: vpsllq $12, %ymm0, %ymm0 +; SKX-NEXT: vpsraq $12, %ymm0, %ymm0 +; SKX-NEXT: retq %b = lshr <4 x i64> %a, %c = shl <4 x i64> %b, %d = ashr <4 x i64> %c, ret <4 x i64> %d; } -; CHECK-LABEL: variable_shl4 -; CHECK: vpsllvq %zmm -; CHECK: ret define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_shl4: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_shl5 -; CHECK: vpsllvd %zmm -; CHECK: ret define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_shl5: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl0 -; CHECK: vpsrlvd -; CHECK: ret define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_srl0: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl2 -; CHECK: psrlvq -; CHECK: ret define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_srl2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_sra1 -; CHECK: vpsravd -; CHECK: ret define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_sra1: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_sra2 -; CHECK: vpsravq %zmm -; CHECK: ret define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_sra2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravq %zmm1, 
%zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <8 x i64> %x, %y ret <8 x i64> %k } -; SKX-LABEL: variable_sra3 -; SKX: vpsravq %ymm -; SKX: ret define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) { +; KNL-LABEL: variable_sra3: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra3: +; SKX: # BB#0: +; SKX-NEXT: vpsravq %ymm1, %ymm0, %ymm0 +; SKX-NEXT: retq %k = ashr <4 x i64> %x, %y ret <4 x i64> %k } -; SKX-LABEL: variable_sra4 -; SKX: vpsravw %xmm -; SKX: ret define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) { +; KNL-LABEL: variable_sra4: +; KNL: # BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovsxwd %xmm0, %ymm0 +; KNL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra4: +; SKX: # BB#0: +; SKX-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; SKX-NEXT: retq %k = ashr <8 x i16> %x, %y ret <8 x i16> %k } -; CHECK-LABEL: variable_sra01_load -; CHECK: vpsravd (% -; CHECK: ret define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_sra01_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = ashr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK-LABEL: variable_shl1_load -; CHECK: vpsllvd (% -; CHECK: ret define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_shl1_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = shl <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl0_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <16 x i32> 
@variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_srl0_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = lshr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl3_load -; CHECK: vpsrlvq (% -; CHECK: ret define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) { +; CHECK-LABEL: variable_srl3_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <8 x i64>, <8 x i64>* %y %k = lshr <8 x i64> %x, %y1 ret <8 x i64> %k diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll new file mode 100644 index 000000000000..75be2d9c0f01 --- /dev/null +++ b/test/CodeGen/X86/bmi-schedule.ll @@ -0,0 +1,529 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) { +; GENERIC-LABEL: test_andn_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %eax +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: andw (%rdx), %di +; GENERIC-NEXT: addl %edi, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; 
GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: notl %edi # sched: [1:0.25] +; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50] +; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: notl %edi # sched: [1:0.50] +; BTVER2-NEXT: andw (%rdx), %di # sched: [4:1.00] +; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: notl %edi # sched: [1:0.25] +; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50] +; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a2 + %2 = xor i16 %a0, -1 + %3 = and i16 %2, %a1 + %4 = and i16 %2, %1 + %5 = add i16 %3, %4 + ret i16 %5 +} + +define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_andn_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %ecx +; GENERIC-NEXT: andnl (%rdx), %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl (%rdx), %edi, %eax # sched: [4:1.00] +; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: 
andnl (%rdx), %edi, %eax # sched: [5:0.50] +; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = xor i32 %a0, -1 + %3 = and i32 %2, %a1 + %4 = and i32 %2, %1 + %5 = add i32 %3, %4 + ret i32 %5 +} + +define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_andn_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnq %rsi, %rdi, %rcx +; GENERIC-NEXT: andnq (%rdx), %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:1.00] +; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50] +; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = xor i64 %a0, -1 + %3 = and i64 %2, %a1 + %4 = and i64 %2, %1 + %5 = add i64 %3, %4 + ret i64 %5 +} + +define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bextr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx +; GENERIC-NEXT: bextrl %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50] +; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: 
[1:1.00] +; +; BTVER2-LABEL: test_bextr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bextr.32(i32, i32) + +define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bextr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50] +; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0) + %3 = tail call i64 
@llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bextr.64(i64, i64) + +define i32 @test_blsi_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsil (%rsi), %ecx +; GENERIC-NEXT: blsil %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 0, %1 + %3 = sub i32 0, %a0 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsi_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsiq (%rsi), %rcx +; GENERIC-NEXT: blsiq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 0, %1 + %3 = sub i64 0, %a0 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsmsk_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskl (%rsi), %ecx +; GENERIC-NEXT: blsmskl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = xor i32 %1, %2 + %5 = xor i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsmsk_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskq (%rsi), %rcx +; GENERIC-NEXT: blsmskq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50] +; 
HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = xor i64 %1, %2 + %5 = xor i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsr_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrl (%rsi), %ecx +; GENERIC-NEXT: blsrl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + 
+define i64 @test_blsr_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrq (%rsi), %rcx +; GENERIC-NEXT: blsrq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_cttz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntw (%rsi), %cx +; GENERIC-NEXT: tzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # 
sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.cttz.i16(i16, i1) + +define i32 @test_cttz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cttz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntl (%rsi), %ecx +; GENERIC-NEXT: tzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.cttz.i32(i32, i1) + +define i64 @test_cttz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cttz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntq (%rsi), %rcx +; GENERIC-NEXT: tzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; 
HASWELL-LABEL: test_cttz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.cttz.i64(i64, i1) diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll new file mode 100644 index 000000000000..9666dd85d853 --- /dev/null +++ b/test/CodeGen/X86/bmi2-schedule.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_bzhi_i32(i32 %a0, 
i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bzhi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx +; GENERIC-NEXT: bzhil %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50] +; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) + +define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bzhi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50] +; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) 
+ +define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pdep_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx +; GENERIC-NEXT: pdepl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pdep.32(i32, i32) + +define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pdep_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pdepq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} 
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64) + +define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pext_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextl (%rdx), %edi, %ecx +; GENERIC-NEXT: pextl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pext.32(i32, i32) + +define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pext_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pextq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) 
+ %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pext.64(i64, i64) diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index e292ccd0be11..7c1042878d59 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -19,7 +19,7 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> @@ -31,7 +31,7 @@ define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y @@ -56,7 +56,7 @@ define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y @@ -91,7 +91,7 @@ define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b diff --git a/test/CodeGen/X86/bswap-rotate.ll b/test/CodeGen/X86/bswap-rotate.ll new file mode 100644 index 000000000000..f686febe5645 --- /dev/null +++ 
b/test/CodeGen/X86/bswap-rotate.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +; Combine BSWAP (lowered to rolw 8) with a second rotate. +; This test checks for combining rotates with inconsistent constant value types. + +define i16 @combine_bswap_rotate(i16 %a0) { +; X86-LABEL: combine_bswap_rotate: +; X86: # BB#0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: retl +; +; X64-LABEL: combine_bswap_rotate: +; X64: # BB#0: +; X64-NEXT: rolw $9, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + %1 = call i16 @llvm.bswap.i16(i16 %a0) + %2 = shl i16 %1, 1 + %3 = lshr i16 %1, 15 + %4 = or i16 %2, %3 + ret i16 %4 +} + +declare i16 @llvm.bswap.i16(i16) diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll index 02f1a1616db2..b69b18531601 100644 --- a/test/CodeGen/X86/clobber-fi0.ll +++ b/test/CodeGen/X86/clobber-fi0.ll @@ -15,22 +15,22 @@ bb: %tmp = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp2 = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp3 = alloca i32 ; [#uses=1 type=i32*] - store i32 1, i32* %tmp, align 4 - store i32 1, i32* %tmp2, align 4 + store volatile i32 1, i32* %tmp, align 4 + store volatile i32 1, i32* %tmp2, align 4 br label %bb4 bb4: ; preds = %bb4, %bb - %tmp6 = load i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] + %tmp6 = load volatile i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] %tmp7 = add i32 %tmp6, -1 ; [#uses=2 type=i32] - store i32 %tmp7, i32* %tmp2, align 4 + store volatile i32 %tmp7, i32* %tmp2, align 4 %tmp8 = icmp eq i32 %tmp7, 0 ; [#uses=1 type=i1] - %tmp9 = load i32, i32* %tmp ; [#uses=1 type=i32] + %tmp9 = load volatile i32, i32* %tmp ; [#uses=1 type=i32] %tmp10 = add i32 %tmp9, -1 ; [#uses=1 type=i32] - store i32 %tmp10, i32* %tmp3 + store volatile i32 
%tmp10, i32* %tmp3 br i1 %tmp8, label %bb11, label %bb4 bb11: ; preds = %bb4 - %tmp12 = load i32, i32* %tmp, align 4 ; [#uses=1 type=i32] + %tmp12 = load volatile i32, i32* %tmp, align 4 ; [#uses=1 type=i32] ret i32 %tmp12 } diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll index 713ee5d0f65a..0d74c937af33 100644 --- a/test/CodeGen/X86/combine-rotates.ll +++ b/test/CodeGen/X86/combine-rotates.ll @@ -6,22 +6,12 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { ; XOP-LABEL: combine_vec_rot_rot: ; XOP: # BB#0: -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_rot: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -40,12 +30,7 @@ define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $10, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprold $7, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -63,12 +48,6 @@ define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat_zero: ; AVX512: # BB#0: -; 
AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3dbff2680c22..a6491a0a8694 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -392,7 +392,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -437,7 +437,7 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -481,7 +481,7 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, @@ -515,7 +515,7 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) { ; AVX-LABEL: combine_vec_shl_add0: ; AVX: # BB#0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %x, @@ -550,7 +550,7 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) { ; ; AVX-LABEL: 
combine_vec_shl_or0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -585,7 +585,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_mul0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 21564cdd7353..473fae19f4fd 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -91,7 +91,7 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_known_zero1: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq @@ -326,7 +326,7 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, @@ -376,10 +376,10 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-udiv.ll 
b/test/CodeGen/X86/combine-udiv.ll index e1e849929405..b6ae2fa6d157 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -166,7 +166,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll index 91da268a8d75..4c7716bbaebe 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -43,7 +43,7 @@ define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, @@ -87,7 +87,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2c: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -146,7 +146,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2d: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -183,7 +183,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: 
vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll new file mode 100644 index 000000000000..15ae4a49d7d3 --- /dev/null +++ b/test/CodeGen/X86/f16c-schedule.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=IVY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, 
%xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) + %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) + %4 = fadd <4 x float> %2, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) + +define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) + %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) + %4 = fadd <8 x float> %2, %3 + ret <8 x float> %4 +} +declare <8 
x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) + +define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0) + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> + store <4 x i16> %3, <4 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) + +define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_256: +; BTVER2: # 
BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0) + store <8 x i16> %2, <8 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index 3d5c12c03484..c87353ed1f5a 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -316,7 +316,7 @@ define void @allocamaterialize() { ; STDERR-NOT: FastISel missed terminator: ret void ; CHECK-LABEL: win64ccfun -define x86_64_win64cc void @win64ccfun(i32 %i) { +define win64cc void @win64ccfun(i32 %i) { ; CHECK: ret ret void } diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll index fbc4cd9d4f9c..86469dad23f2 100644 --- a/test/CodeGen/X86/hipe-cc.ll +++ b/test/CodeGen/X86/hipe-cc.ll @@ -48,11 +48,7 @@ entry: store i32 %arg0, i32* %arg0_var store i32 %arg1, i32* %arg1_var store i32 %arg2, i32* %arg2_var - - ; CHECK: movl 16(%esp), %esi - ; CHECK-NEXT: movl 12(%esp), %ebp - ; CHECK-NEXT: movl 8(%esp), %eax - ; CHECK-NEXT: movl 4(%esp), %edx + ; These loads are loading the values from their previous stores and are optimized away. 
%0 = load i32, i32* %hp_var %1 = load i32, i32* %p_var %2 = load i32, i32* %arg0_var diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll index 43e2e1409fde..efe07cf6301e 100644 --- a/test/CodeGen/X86/hipe-cc64.ll +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -57,11 +57,7 @@ entry: store i64 %arg2, i64* %arg2_var store i64 %arg3, i64* %arg3_var - ; CHECK: movq 40(%rsp), %r15 - ; CHECK-NEXT: movq 32(%rsp), %rbp - ; CHECK-NEXT: movq 24(%rsp), %rsi - ; CHECK-NEXT: movq 16(%rsp), %rdx - ; CHECK-NEXT: movq 8(%rsp), %rcx + ; Loads are reading values just written from corresponding register and are therefore noops. %0 = load i64, i64* %hp_var %1 = load i64, i64* %p_var %2 = load i64, i64* %arg0_var diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll new file mode 100644 index 000000000000..e42ce30c5a6d --- /dev/null +++ b/test/CodeGen/X86/lea32-schedule.ll @@ -0,0 +1,653 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_lea_offset(i32) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -24(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, -24 + ret i32 %2 +} + +define i32 @test_lea_offset_big(i32) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: 
leal 1024(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 1024(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, 1024 + ret i32 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @test_lea_add(i32, i32) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; 
SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i32 %1, %0 + ret i32 %3 +} + +define i32 @test_lea_add_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 16(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI 
%RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $16, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, 16 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_add_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -4096(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal 
(%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-4096, %eax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, -4096 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_mul(i32) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal 
(%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + ret i32 %2 +} + +define i32 @test_lea_mul_offset(i32) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -32(%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-32, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: 
leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + %3 = add nsw i32 %2, -32 + ret i32 %3 +} + +define i32 @test_lea_mul_offset_big(i32) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 10000(%rdi,%rdi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $10000, %eax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 9 + %3 = add nsw i32 %2, 10000 + ret i32 %3 +} + +define i32 @test_lea_add_scale(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale: 
+; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 1 + %4 = add nsw i32 %3, %0 + ret i32 %4 +} + +define i32 @test_lea_add_scale_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # 
sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 96(%rdi,%rsi,4), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $96, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 2 + %4 = add i32 %0, 96 + %5 = add i32 %4, %3 + ret i32 %5 +} + +define i32 @test_lea_add_scale_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: 
[1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -1200(%rdi,%rsi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-1200, %eax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 3 + %4 = add i32 %0, -1200 + %5 = add i32 %4, %3 + ret i32 %5 +} diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll new file mode 100644 index 000000000000..0ff1574c809d --- /dev/null +++ b/test/CodeGen/X86/lea64-schedule.ll @@ -0,0 
+1,534 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i64 @test_lea_offset(i64) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -24(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, -24 + ret i64 %2 +} + +define i64 @test_lea_offset_big(i64) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 1024(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, 1024 + ret i64 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i64 @test_lea_add(i64, i64) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: 
# BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i64 %1, %0 + ret i64 %3 +} + +define i64 @test_lea_add_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 16(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $16, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; 
HASWELL-NEXT: addq $16, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, 16 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_add_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -4096(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-4096, %rax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, -4096 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_mul(i64) { +; GENERIC-LABEL: 
test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + ret i64 %2 +} + +define i64 @test_lea_mul_offset(i64) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -32(%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-32, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 
(%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + %3 = add nsw i64 %2, -32 + ret i64 %3 +} + +define i64 @test_lea_mul_offset_big(i64) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 10000(%rdi,%rdi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $10000, %rax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 9 + %3 = add nsw i64 %2, 10000 + 
ret i64 %3 +} + +define i64 @test_lea_add_scale(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 1 + %4 = add nsw i64 %3, %0 + ret i64 %4 +} + +define i64 @test_lea_add_scale_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 96(%rdi,%rsi,4), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,4), %rax # sched: 
[1:0.50] +; SANDY-NEXT: addq $96, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 2 + %4 = add i64 %0, 96 + %5 = add i64 %4, %3 + ret i64 %5 +} + +define i64 @test_lea_add_scale_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -1200(%rdi,%rsi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-1200, %rax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 3 + %4 = add i64 %0, -1200 + %5 = add i64 %4, %3 + ret i64 %5 +} diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index b3f2116e6486..3ad6cad32d83 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -148,8 +148,7 @@ define i32 @test6() { ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: shldl $32, %eax, %ecx @@ -175,9 +174,8 @@ define i32 @test6() { ; CHECK-NEXT: retl %x = alloca i32, align 4 %t = alloca i64, align 8 - store i32 1, i32* %x, align 4 - store i64 1, i64* %t, align 8 ;; DEAD - %load = load i32, i32* %x, align 4 + store volatile i32 1, i32* %x, align 4 + %load = load volatile i32, i32* %x, align 4 %shl = shl i32 %load, 8 %add = add i32 %shl, -224 %sh_prom = zext i32 %add to i64 diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll new file mode 100644 index 000000000000..cd0dcbbd6afb --- /dev/null +++ b/test/CodeGen/X86/lzcnt-schedule.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctlz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntw (%rsi), %cx +; GENERIC-NEXT: lzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntw (%rsi), %cx +; HASWELL-NEXT: lzcntw %di, %ax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntw (%rsi), %cx +; BTVER2-NEXT: lzcntw %di, %ax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntw (%rsi), %cx +; ZNVER1-NEXT: lzcntw %di, %ax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctlz.i16(i16, i1) + +define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctlz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntl (%rsi), %ecx +; GENERIC-NEXT: lzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntl (%rsi), %ecx +; HASWELL-NEXT: lzcntl %edi, %eax +; HASWELL-NEXT: orl 
%ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntl (%rsi), %ecx +; BTVER2-NEXT: lzcntl %edi, %eax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntl (%rsi), %ecx +; ZNVER1-NEXT: lzcntl %edi, %eax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctlz.i32(i32, i1) + +define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctlz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntq (%rsi), %rcx +; GENERIC-NEXT: lzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntq (%rsi), %rcx +; HASWELL-NEXT: lzcntq %rdi, %rax +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntq (%rsi), %rcx +; BTVER2-NEXT: lzcntq %rdi, %rax +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntq (%rsi), %rcx +; ZNVER1-NEXT: lzcntq %rdi, %rax +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctlz.i64(i64, i1) diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll index 26a194764086..02d0964e37eb 100644 --- a/test/CodeGen/X86/machine-outliner-debuginfo.ll +++ 
b/test/CodeGen/X86/machine-outliner-debuginfo.ll @@ -17,6 +17,7 @@ define i32 @main() #0 !dbg !11 { call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !15, metadata !16), !dbg !17 store i32 4, i32* %5, align 4 store i32 0, i32* @x, align 4, !dbg !24 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; This is the same sequence of instructions without a debug value. It should be outlined ; in the same way. ; CHECK: callq l_OUTLINED_FUNCTION_0 diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll index 9f8e6ec298f4..b4a277ec2d82 100644 --- a/test/CodeGen/X86/machine-outliner.ll +++ b/test/CodeGen/X86/machine-outliner.ll @@ -85,6 +85,7 @@ define i32 @main() #0 { store i32 3, i32* %4, align 4 store i32 4, i32* %5, align 4 store i32 1, i32* @x, align 4 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: callq [[OFUNC2]] store i32 1, i32* %2, align 4 store i32 2, i32* %3, align 4 diff --git a/test/CodeGen/X86/memcmp-minsize.ll b/test/CodeGen/X86/memcmp-minsize.ll new file mode 100644 index 000000000000..a7f42644ca2d --- /dev/null +++ b/test/CodeGen/X86/memcmp-minsize.ll @@ -0,0 +1,721 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* 
%X, i8* %Y) nounwind minsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; 
X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret 
i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $4, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: pushq $4 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, 
(%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: pushq $8 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; 
X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: +; X64-NEXT: pushq $16 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl 
%eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: 
cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: pushq $32 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; 
X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, 
i8* %Y) nounwind minsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; 
X64-NEXT: popq %rdx +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll new file mode 100644 index 000000000000..450205a966d2 --- /dev/null +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -0,0 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # 
BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: 
movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; 
X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m 
+} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: 
movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; 
X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: incl %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB13_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: 
retl +; +; X64-LABEL: length12: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; 
X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; 
X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; 
X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl 
$64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 0e09abf73c8c..2e6782765462 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 
--check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=AVX2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -12,43 +12,21 @@ declare i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length2: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movzwl (%ecx), %ecx -; X86-NOSSE-NEXT: movzwl (%eax), %eax -; X86-NOSSE-NEXT: rolw $8, %cx -; X86-NOSSE-NEXT: rolw $8, %ax -; X86-NOSSE-NEXT: cmpw %ax, %cx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB0_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB0_3 -; X86-NOSSE-NEXT: .LBB0_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB0_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB0_4 -; X86-NOSSE-NEXT: .LBB0_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length2: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movzwl (%ecx), %ecx -; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: rolw $8, %cx -; X86-SSE2-NEXT: rolw $8, %ax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpw %ax, %cx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, 
%eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length2: ; X64: # BB#0: @@ -137,44 +115,90 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -182,43 +206,21 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length4: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %ecx -; X86-NOSSE-NEXT: movl (%eax), %eax -; X86-NOSSE-NEXT: bswapl %ecx -; X86-NOSSE-NEXT: bswapl %eax -; X86-NOSSE-NEXT: cmpl %eax, %ecx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB6_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB6_3 -; X86-NOSSE-NEXT: .LBB6_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB6_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB6_4 -; X86-NOSSE-NEXT: .LBB6_3: -; 
X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length4: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %ecx -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: bswapl %ecx -; X86-SSE2-NEXT: bswapl %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpl %eax, %ecx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length4: ; X64: # BB#0: @@ -278,44 +280,86 @@ define i1 @length4_eq_const(i8* %X) nounwind { define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: 
length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 
@memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -324,13 +368,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -352,13 +416,20 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl 
%eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -376,13 +447,17 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -400,25 +475,43 @@ define i1 @length8_eq_const(i8* %X) nounwind { define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: je .LBB14_4 +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_4: # %endblock +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq 
(%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -427,19 +520,66 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 
(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -448,111 +588,165 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: 
.LBB16_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: 
pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 12(%eax), %edx +; X86-NEXT: je .LBB17_5 +; X86-NEXT: .LBB17_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB17_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $858927408, (%eax) # imm = 0x33323130 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: cmpl $926299444, 4(%eax) # imm = 0x37363534 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: cmpl $825243960, 8(%eax) # imm = 0x31303938 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $892613426, 12(%eax) # imm = 0x35343332 +; X86-NEXT: je .LBB18_5 +; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB18_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne 
.LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -570,9 +764,43 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length32: -; X64: # BB#0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#5: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m } @@ -592,25 +820,30 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # 
BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: cmpq 8(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: cmpq 16(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 24(%rsi), %rcx +; X64-NEXT: je .LBB20_5 +; X64-NEXT: .LBB20_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB20_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -629,26 +862,30 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; 
X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rax, 8(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-NEXT: cmpq %rax, 16(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3544395820347831604, %rcx # imm = 0x3130393837363534 +; X64-NEXT: cmpq %rcx, 24(%rdi) +; X64-NEXT: je .LBB21_5 +; X64-NEXT: .LBB21_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB21_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 50a661fcca11..76d750855cd4 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -105,7 +105,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v4i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -523,7 +523,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v8i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] ; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq entry: @@ -551,7 +551,7 @@ define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { ; ; AVX-LABEL: mul_v4i64c: ; AVX: # BB#0: # %entry -; 
AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll new file mode 100644 index 000000000000..c0d11280fc1d --- /dev/null +++ b/test/CodeGen/X86/popcnt-schedule.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+popcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctpop_i16: +; GENERIC: # BB#0: +; 
GENERIC-NEXT: popcntw (%rsi), %cx +; GENERIC-NEXT: popcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i16: +; SLM: # BB#0: +; SLM-NEXT: popcntw (%rsi), %cx # sched: [6:1.00] +; SLM-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX %AX %EAX +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i16: +; SANDY: # BB#0: +; SANDY-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; SANDY-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX %AX %EAX +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntw (%rsi), %cx # sched: [8:1.00] +; BTVER2-NEXT: popcntw %di, %ax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00] +; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctpop.i16( i16 %1 ) + %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctpop.i16(i16) + +define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctpop_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntl (%rsi), %ecx +; GENERIC-NEXT: popcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; 
SLM-LABEL: test_ctpop_i32: +; SLM: # BB#0: +; SLM-NEXT: popcntl (%rsi), %ecx # sched: [6:1.00] +; SLM-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i32: +; SANDY: # BB#0: +; SANDY-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; SANDY-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00] +; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00] +; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctpop.i32( i32 %1 ) + %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctpop.i32(i32) + +define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctpop_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntq (%rsi), %rcx +; GENERIC-NEXT: popcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i64: +; SLM: # BB#0: +; SLM-NEXT: popcntq (%rsi), %rcx # sched: [6:1.00] +; SLM-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i64: +; SANDY: # BB#0: +; SANDY-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00] +; 
SANDY-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00] +; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00] +; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctpop.i64( i64 %1 ) + %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll new file mode 100644 index 000000000000..26c4bdb2375a --- /dev/null +++ b/test/CodeGen/X86/pr32282.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 + +; Check for assert in foldMaskAndShiftToScale due to out of range mask scaling. 
+ +@b = common global i8 zeroinitializer, align 1 +@c = common global i8 zeroinitializer, align 1 +@d = common global i64 zeroinitializer, align 8 +@e = common global i64 zeroinitializer, align 8 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: +; X86-NEXT: pushl %eax +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl d, %eax +; X86-NEXT: movl d+4, %ecx +; X86-NEXT: movl $701685459, %edx # imm = 0x29D2DED3 +; X86-NEXT: andnl %edx, %ecx, %ecx +; X86-NEXT: movl $-564453154, %edx # imm = 0xDE5B20DE +; X86-NEXT: andnl %edx, %eax, %edx +; X86-NEXT: shrdl $21, %ecx, %edx +; X86-NEXT: shrl $21, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: addl $7, %edx +; X86-NEXT: adcxl %eax, %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %edx +; X86-NEXT: .Lcfi2: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi3: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi4: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll __divdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: .Lcfi5: +; X86-NEXT: .cfi_adjust_cfa_offset -16 +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne {{[0-9]+}}(%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl +; +; X64-LABEL: foo: +; X64: # BB#0: +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000 +; X64-NEXT: andnq %rcx, %rax, %rcx +; X64-NEXT: shrq $21, %rcx +; X64-NEXT: addq $7, %rcx +; X64-NEXT: movabsq $4393751543808, %rax # imm = 0x3FF00000000 +; X64-NEXT: testq %rax, %rcx +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivq %rcx +; X64-NEXT: jmp .LBB0_3 +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl 
%edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: # kill: %EAX %EAX %RAX +; X64-NEXT: .LBB0_3: +; X64-NEXT: testq %rax, %rax +; X64-NEXT: setne -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + %1 = alloca i8, align 1 + %2 = load i64, i64* @d, align 8 + %3 = or i64 -3013716102214263007, %2 + %4 = xor i64 %3, -1 + %5 = load i64, i64* @e, align 8 + %6 = load i8, i8* @b, align 1 + %7 = trunc i8 %6 to i1 + %8 = zext i1 %7 to i64 + %9 = xor i64 %5, %8 + %10 = load i8, i8* @c, align 1 + %11 = trunc i8 %10 to i1 + %12 = zext i1 %11 to i32 + %13 = or i32 551409149, %12 + %14 = sub nsw i32 %13, 551409131 + %15 = zext i32 %14 to i64 + %16 = shl i64 %9, %15 + %17 = sub nsw i64 %16, 223084523 + %18 = ashr i64 %4, %17 + %19 = and i64 %18, 9223372036854775806 + %20 = add nsw i64 7, %19 + %21 = sdiv i64 0, %20 + %22 = icmp ne i64 %21, 0 + %23 = zext i1 %22 to i8 + store i8 %23, i8* %1, align 1 + ret void +} diff --git a/test/CodeGen/X86/pr32515.ll b/test/CodeGen/X86/pr32515.ll new file mode 100644 index 000000000000..aeb6803867aa --- /dev/null +++ b/test/CodeGen/X86/pr32515.ll @@ -0,0 +1,29 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s +; REQUIRES: asserts + +@var_26 = external global i16, align 2 + +define void @foo() #0 { + %1 = alloca i16, align 2 + %2 = load i16, i16* @var_26, align 2 + %3 = zext i16 %2 to i32 + %4 = icmp ne i32 %3, 7 + %5 = zext i1 %4 to i16 + store i16 %5, i16* %1, align 2 + %6 = load i16, i16* @var_26, align 2 + %7 = zext i16 %6 to i32 + %8 = and i32 1, %7 + %9 = shl i32 %8, 0 + %10 = load i16, i16* @var_26, align 2 + %11 = zext i16 %10 to i32 + %12 = icmp ne i32 %11, 7 + %13 = zext i1 %12 to i32 + %14 = and i32 %9, %13 + %15 = icmp ne i32 %14, 0 + %16 = zext i1 %15 to i8 + store i8 %16, i8* undef, align 1 + unreachable + } diff --git a/test/CodeGen/X86/pr33772.ll b/test/CodeGen/X86/pr33772.ll new 
file mode 100644 index 000000000000..ff22c7478866 --- /dev/null +++ b/test/CodeGen/X86/pr33772.ll @@ -0,0 +1,15 @@ +; RUN: not llc < %s -mcpu=skylake-avx512 2>&1 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; make sure we don't crash if scale for gather isn't constant. + +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.x86.avx512.gather.dpi.512 +declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32) + +define internal <16 x i32> @__gather_base_offsets32_i32(i8* readonly %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i8> %vecmask) { + %mask_vec_i1.i.i = icmp ne <16 x i8> %vecmask, zeroinitializer + %mask_i16.i = bitcast <16 x i1> %mask_vec_i1.i.i to i16 + %res = tail call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask_i16.i, i32 %offset_scale) + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/pr33828.ll b/test/CodeGen/X86/pr33828.ll new file mode 100644 index 000000000000..1b7f44323b61 --- /dev/null +++ b/test/CodeGen/X86/pr33828.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X64 + +@var_580 = external local_unnamed_addr global i8, align 1 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: # %entry +; X86-NEXT: movsbl var_580, %eax +; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # BB#2: # %if.end13 +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: # %if.then11 +; +; X64-LABEL: foo: +; X64: # BB#0: # %entry +; X64-NEXT: movsbl {{.*}}(%rip), %eax +; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # BB#2: # %if.end13 +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: # %if.then11 +entry: + %tmp = icmp ugt i8 undef, 60 + %phitmp = zext i1 
%tmp to i16 + br label %if.end + +if.end: + %tmp1 = load i8, i8* @var_580, align 1 + %conv7 = sext i8 %tmp1 to i32 + %conv8 = zext i16 %phitmp to i32 + %mul = shl nuw nsw i32 %conv8, 1 + %div9 = udiv i32 %mul, 71 + %sub = add nsw i32 %div9, -3 + %shl = shl i32 1, %sub + %neg = xor i32 %shl, -1 + %and = and i32 %neg, %conv7 + %tobool10 = icmp eq i32 %and, 0 + br i1 %tobool10, label %if.end13, label %if.then11 + +if.then11: + unreachable + +if.end13: + ret void +} diff --git a/test/CodeGen/X86/regparm.ll b/test/CodeGen/X86/regparm.ll index 9484e5a9490b..f427010edc51 100644 --- a/test/CodeGen/X86/regparm.ll +++ b/test/CodeGen/X86/regparm.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck -check-prefix=CHECK %s +; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck %s ; RUN: llc %s -mtriple=i386-pc-win32 -o - | FileCheck -check-prefix=WIN %s ; RUN: llc %s -mtriple=i386-pc-linux -fast-isel -o - | FileCheck -check-prefix=FAST %s ; RUN: llc %s -mtriple=i386-pc-win32 -fast-isel -o - | FileCheck -check-prefix=FASTWIN %s diff --git a/test/CodeGen/X86/rotate_vec.ll b/test/CodeGen/X86/rotate_vec.ll new file mode 100644 index 000000000000..8fb000bae827 --- /dev/null +++ b/test/CodeGen/X86/rotate_vec.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s + +define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) 
{ +; CHECK-LABEL: rot_v4i32_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll index 204e1f8b050b..b9d5a4813e09 100644 --- a/test/CodeGen/X86/sibcall-win64.ll +++ b/test/CodeGen/X86/sibcall-win64.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -declare x86_64_win64cc void @win64_callee(i32) -declare x86_64_win64cc void (i32)* @win64_indirect() -declare x86_64_win64cc void @win64_other(i32) +declare win64cc void @win64_callee(i32) +declare win64cc void (i32)* @win64_indirect() +declare win64cc void @win64_other(i32) declare void @sysv_callee(i32) declare void (i32)* @sysv_indirect() declare void @sysv_other(i32) define void @sysv_caller(i32 %p1) { entry: - tail call x86_64_win64cc void @win64_callee(i32 %p1) + tail call win64cc void @win64_callee(i32 %p1) ret void } @@ -19,7 +19,7 @@ entry: ; CHECK: addq $40, %rsp ; CHECK: retq -define x86_64_win64cc void @win64_caller(i32 %p1) { +define win64cc void @win64_caller(i32 %p1) { entry: tail call void @sysv_callee(i32 %p1) ret void @@ -37,18 +37,18 @@ define void @sysv_matched(i32 %p1) { ; CHECK-LABEL: sysv_matched: ; CHECK: jmp sysv_callee # TAILCALL -define x86_64_win64cc void @win64_matched(i32 %p1) { - tail call x86_64_win64cc void @win64_callee(i32 %p1) +define win64cc void 
@win64_matched(i32 %p1) { + tail call win64cc void @win64_callee(i32 %p1) ret void } ; CHECK-LABEL: win64_matched: ; CHECK: jmp win64_callee # TAILCALL -define x86_64_win64cc void @win64_indirect_caller(i32 %p1) { - %1 = call x86_64_win64cc void (i32)* @win64_indirect() - call x86_64_win64cc void @win64_other(i32 0) - tail call x86_64_win64cc void %1(i32 %p1) +define win64cc void @win64_indirect_caller(i32 %p1) { + %1 = call win64cc void (i32)* @win64_indirect() + call win64cc void @win64_other(i32 0) + tail call win64cc void %1(i32 %p1) ret void } diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index c41acd43b3ab..29f726c3df6a 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; GENERIC-LABEL: test_addps: @@ -45,6 +45,12 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] 
+; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fadd <4 x float> %1, %2 @@ -87,6 +93,12 @@ define float @test_addss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fadd float %1, %2 @@ -137,6 +149,12 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -191,6 +209,12 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -245,6 +269,13 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, 
%xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fcmp oeq <4 x float> %a0, %2 @@ -290,6 +321,12 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = insertelement <4 x float> undef, float %a1, i32 0 %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) @@ -385,6 +422,20 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) @@ -435,6 +486,13 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to float %2 = load i32, i32 *%a1, align 4 %3 = sitofp i32 %2 to float @@ -484,6 +542,13 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ssq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to float %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to float @@ -533,6 +598,13 @@ define i32 @test_cvtss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) %3 = load float, float 
*%a1, align 4 @@ -585,6 +657,13 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -637,6 +716,13 @@ define i32 @test_cvttss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i32 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i32 @@ -686,6 +772,13 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i64 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i64 @@ -729,6 +822,12 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: 
[24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fdiv <4 x float> %1, %2 @@ -771,6 +870,12 @@ define float @test_divss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fdiv float %1, %2 @@ -813,6 +918,12 @@ define void @test_ldmxcsr(i32 %a0) { ; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ldmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* store i32 %a0, i32* %1 @@ -857,6 +968,12 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x 
float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2) @@ -900,6 +1017,12 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2) @@ -943,6 +1066,12 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2) @@ -986,6 +1115,12 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> 
%a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2) @@ -1035,6 +1170,13 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 16 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 16 @@ -1079,6 +1221,11 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1129,6 +1276,13 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1177,6 +1331,12 @@ define <4 x float> @test_movlhps(<4 
x float> %a0, <4 x float> %a1) { ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = fadd <4 x float> %a1, %1 ret <4 x float> %2 @@ -1224,6 +1384,13 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1266,6 +1433,11 @@ define i32 @test_movmskps(<4 x float> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %1 } @@ -1307,6 +1479,11 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] store <4 x float> %a0, <4 x 
float> *%a1, align 16, !nontemporal !0 ret void } @@ -1353,6 +1530,13 @@ define void @test_movss_mem(float* %a0, float* %a1) { ; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float* %a0, align 1 %2 = fadd float %1, %1 store float %2, float *%a1, align 1 @@ -1395,6 +1579,11 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1441,6 +1630,13 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 1 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 1 @@ -1483,6 +1679,12 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fmul <4 x float> %1, %2 @@ -1525,6 +1727,12 @@ define float @test_mulss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fmul float %1, %2 @@ -1575,6 +1783,12 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -1621,6 +1835,11 @@ define void @test_prefetchnta(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_prefetchnta: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) ret void } @@ -1670,6 +1889,13 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # 
sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2) @@ -1728,6 +1954,14 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) { ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1782,6 +2016,13 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2) @@ -1840,6 +2081,14 @@ define 
<4 x float> @test_rsqrtss(float %a0, float *%a1) { ; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1886,6 +2135,11 @@ define void @test_sfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: sfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: sfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse.sfence() ret void } @@ -1931,6 +2185,12 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -1980,6 +2240,13 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] 
+; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2) @@ -2038,6 +2305,14 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2) @@ -2082,6 +2357,12 @@ define i32 @test_stmxcsr() { ; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_stmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -2126,6 +2407,12 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fsub <4 x float> %1, %2 @@ -2168,6 +2455,12 @@ define float @test_subss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fsub float %1, %2 @@ -2258,6 +2551,20 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2) @@ -2306,6 +2613,12 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2352,6 +2665,12 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2402,6 +2721,12 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git 
a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 3c36b2138139..6ee908e0c787 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %1, %2 @@ -87,6 +93,12 @@ define double @test_addsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # 
sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fadd double %1, %2 @@ -135,6 +147,13 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -188,6 +207,13 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -243,6 +269,13 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <2 
x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fcmp oeq <2 x double> %a0, %2 @@ -288,6 +321,12 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = insertelement <2 x double> undef, double %a1, i32 0 %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) @@ -383,6 +422,20 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) @@ -433,6 +486,13 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: 
retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sitofp <2 x i32> %1 to <2 x double> %3 = load <4 x i32>, <4 x i32>*%a1, align 16 @@ -485,6 +545,13 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x float> %2 = load <4 x i32>, <4 x i32>*%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x float> @@ -535,6 +602,13 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) @@ -586,6 +660,13 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq 
# sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) @@ -637,6 +718,13 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) @@ -688,6 +776,13 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> %2 = fpext <2 x float> %1 to <2 x double> %3 = load <4 x float>, <4 x float> *%a1, align 16 @@ -739,6 +834,13 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: 
[1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -791,6 +893,13 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -850,6 +959,14 @@ define float @test_cvtsd2ss(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc double %a0 to float %2 = load double, double *%a1, align 8 %3 = fptrunc double %2 to float @@ -899,6 +1016,13 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: 
[4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to double %2 = load i32, i32 *%a1, align 8 %3 = sitofp i32 %2 to double @@ -948,6 +1072,13 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to double %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to double @@ -1006,6 +1137,14 @@ define double @test_cvtss2sd(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fpext float %a0 to double %2 = load float, float *%a1, align 4 %3 = fpext float %2 to double @@ -1056,6 +1195,13 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttpd2dq: +; ZNVER1: # BB#0: 
+; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <2 x double> %a0 to <2 x i32> %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> %3 = load <2 x double>, <2 x double> *%a1, align 16 @@ -1108,6 +1254,13 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x float> %a0 to <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = fptosi <4 x float> %2 to <4 x i32> @@ -1157,6 +1310,13 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i32 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i32 @@ -1206,6 +1366,13 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00] +; 
ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i64 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i64 @@ -1249,6 +1416,12 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fdiv <2 x double> %1, %2 @@ -1291,6 +1464,12 @@ define double @test_divsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fdiv double %1, %2 @@ -1333,6 +1512,11 @@ define void @test_lfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: lfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.lfence() ret void } @@ -1374,6 +1558,11 @@ define void @test_mfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: mfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: mfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.mfence() ret void } @@ -1413,6 
+1602,11 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) ret void } @@ -1454,6 +1648,12 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) @@ -1497,6 +1697,12 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) @@ -1540,6 +1746,12 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: 
vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) @@ -1583,6 +1795,12 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) @@ -1632,6 +1850,13 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 16 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 16 @@ -1680,6 +1905,13 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: 
vmovdqa %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 16 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 16 @@ -1728,6 +1960,13 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 1 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 1 @@ -1794,6 +2033,16 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> undef, i32 %2, i32 0 @@ -1865,6 +2114,16 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 
# sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 %2 = load i64, i64 *%a2 %3 = insertelement <2 x i64> undef, i64 %2, i64 0 @@ -1918,6 +2177,13 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 1 @@ -1969,6 +2235,13 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 0 @@ 
-2010,6 +2283,11 @@ define i32 @test_movmskpd(<2 x double> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ret i32 %1 } @@ -2053,6 +2331,12 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a0 store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 ret void @@ -2094,6 +2378,12 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a0 store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 ret void @@ -2141,6 +2431,13 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = load i64, i64* %a1, align 1 %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 %3 = add <2 x i64> %a0, %2 @@ -2187,6 +2484,12 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { ; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> %2 = add <2 x i64> %a1, %1 ret <2 x i64> %2 @@ -2234,6 +2537,13 @@ define void @test_movsd_mem(double* %a0, double* %a1) { ; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double* %a0, align 1 %2 = fadd double %1, %1 store double %2, double *%a1, align 1 @@ -2277,6 +2587,11 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> ret <2 x double> %1 } @@ -2323,6 +2638,13 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # 
sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 1 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 1 @@ -2365,6 +2687,12 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fmul <2 x double> %1, %2 @@ -2407,6 +2735,12 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fmul double %1, %2 @@ -2455,6 +2789,13 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %xmm0, 
%xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -2510,6 +2851,12 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packssdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -2562,6 +2909,12 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packsswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2614,6 +2967,12 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packuswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; 
ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2662,6 +3021,12 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = add <16 x i8> %1, %2 @@ -2708,6 +3073,12 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = add <4 x i32> %1, %2 @@ -2750,6 +3121,12 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = add <2 x i64> %1, %2 @@ -2796,6 +3173,12 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, 
<16 x i8> *%a2) { ; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) @@ -2843,6 +3226,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) @@ -2890,6 +3279,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) @@ -2937,6 +3332,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, 
<8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) @@ -2984,6 +3385,12 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = add <8 x i16> %1, %2 @@ -3032,6 +3439,13 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pand: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = and <2 x i64> %1, %2 @@ -3087,6 +3501,13 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pandn: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %2 = and <2 x i64> %a1, %1 %3 = load <2 x i64>, <2 x i64> *%a2, align 16 @@ -3136,6 +3557,12 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) @@ -3183,6 +3610,12 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) @@ -3234,6 +3667,13 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: 
retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp eq <16 x i8> %a0, %2 @@ -3286,6 +3726,13 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3338,6 +3785,13 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp eq <8 x i16> %a0, %2 @@ -3391,6 +3845,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pcmpgtb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp sgt <16 x i8> %a0, %2 @@ -3444,6 +3905,13 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3497,6 +3965,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp sgt <8 x i16> %a0, %2 @@ -3541,6 +4016,12 @@ define i16 @test_pextrw(<8 x i16> %a0) { ; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: 
%AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 6 ret i16 %1 } @@ -3585,6 +4066,12 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 %2 = load i16, i16 *%a2 %3 = insertelement <8 x i16> %1, i16 %2, i32 3 @@ -3635,6 +4122,12 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <4 x i32> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -3683,6 +4176,12 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> 
%1, <8 x i16> %2) @@ -3730,6 +4229,12 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) @@ -3777,6 +4282,12 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) @@ -3824,6 +4335,12 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x 
i8> %1, <16 x i8> %2) @@ -3863,6 +4380,11 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovmskb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ret i32 %1 } @@ -3904,6 +4426,12 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) @@ -3947,6 +4475,12 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) @@ -3990,6 +4524,12 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmullw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = mul <8 x i16> %1, %2 @@ -4040,6 +4580,12 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuludq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -4090,6 +4636,13 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_por: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = or <2 x i64> %1, %2 @@ -4141,6 +4694,12 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; 
ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -4193,6 +4752,13 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> @@ -4244,6 +4810,13 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50] +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4295,6 +4868,13 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pshuflw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50] +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4344,6 +4924,13 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) @@ -4389,6 +4976,11 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4435,6 +5027,13 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllq: +; ZNVER1: # 
BB#0: +; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) @@ -4486,6 +5085,13 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) @@ -4537,6 +5143,13 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) @@ -4588,6 +5201,13 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: 
[6:1.00] ; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psraw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) @@ -4639,6 +5259,13 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) @@ -4684,6 +5311,11 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4730,6 +5362,13 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlq $2, 
%xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) @@ -4781,6 +5420,13 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) @@ -4830,6 +5476,12 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = sub <16 x i8> %1, %2 @@ -4876,6 +5528,12 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubd 
(%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = sub <4 x i32> %1, %2 @@ -4918,6 +5576,12 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = sub <2 x i64> %1, %2 @@ -4964,6 +5628,12 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) @@ -5011,6 +5681,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: 
[1:0.25] +; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) @@ -5058,6 +5734,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) @@ -5105,6 +5787,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) @@ -5152,6 +5840,12 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, 
%xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = sub <8 x i16> %1, %2 @@ -5198,6 +5892,12 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5248,6 +5948,13 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; 
ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5297,6 +6004,13 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5344,6 +6058,12 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5390,6 +6110,12 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5440,6 +6166,13 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5489,6 +6222,13 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5536,6 +6276,12 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5584,6 +6330,13 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pxor: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 
%3 = xor <2 x i64> %1, %2 @@ -5633,6 +6386,13 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -5683,6 +6443,13 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) @@ -5741,6 +6508,14 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 
# sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -5785,6 +6560,12 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fsub <2 x double> %1, %2 @@ -5827,6 +6608,12 @@ define double @test_subsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fsub double %1, %2 @@ -5917,6 +6704,20 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: 
[1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) @@ -5967,6 +6768,13 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -6022,6 +6830,13 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> @@ -6071,6 +6886,13 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, 
<2 x double> ; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll index ef1ddae4532d..ad38d1c6ff49 100644 --- a/test/CodeGen/X86/sse3-schedule.ll +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addsubpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), 
%xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) @@ -88,6 +94,12 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) @@ -131,6 +143,12 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2) @@ -174,6 +192,12 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; 
ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) @@ -217,6 +241,12 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) @@ -260,6 +290,12 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) @@ -299,6 +335,11 @@ define <16 x i8> @test_lddqu(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu 
(%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ret <16 x i8> %1 } @@ -347,6 +388,13 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,6 +445,13 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50] +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -447,6 +502,13 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = 
mem[0,0,2,2] sched: [8:0.50] +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index 1ab1598fcab7..26cca98816a3 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_blendpd: @@ -43,6 +43,13 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x 
double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %a1, %1 @@ -80,6 +87,12 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -122,6 +135,12 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) %2 = load <2 x double>, <2 x double> *%a3, align 16 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) @@ -165,6 +184,12 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %xmm2, 
%xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) %2 = load <4 x float>, <4 x float> *%a3 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) @@ -202,6 +227,12 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7) @@ -239,6 +270,12 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7) @@ -276,6 +313,12 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] ; BTVER2-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) %2 = load float, float *%a2 %3 = insertelement <4 x float> %1, float %2, i32 3 @@ -308,6 +351,11 @@ define <2 x i64> @test_movntdqa(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0) ret <2 x i64> %1 } @@ -343,6 +391,12 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00] ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mpsadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = bitcast <8 x i16> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -381,6 +435,12 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packusdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, 
%xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -425,6 +485,12 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendvb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) %2 = load <16 x i8>, <16 x i8> *%a3, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2) @@ -462,6 +528,12 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -498,6 +570,12 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # 
sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 @@ -536,6 +614,12 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) { ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <16 x i8> %a0, i32 3 %2 = extractelement <16 x i8> %a0, i32 1 store i8 %2, i8 *%a1 @@ -573,6 +657,12 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) { ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <4 x i32> %a0, i32 3 %2 = extractelement <4 x i32> %a0, i32 1 store i32 %2, i32 *%a1 @@ -609,6 +699,12 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { ; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <2 x i64> %a0, i32 1 %2 = extractelement <2 x i64> %a0, i32 1 store 
i64 %2, i64 *%a2 @@ -645,6 +741,12 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) { ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 3 %2 = extractelement <8 x i16> %a0, i32 1 store i16 %2, i16 *%a1 @@ -682,6 +784,12 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) { ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phminposuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a0, align 16 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1) %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2) @@ -719,6 +827,12 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 %2 = load i8, i8 *%a2 %3 = insertelement <16 x i8> %1, i8 %2, i32 3 @@ -755,6 +869,12 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # 
sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> %1, i32 %2, i32 3 @@ -796,6 +916,13 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1 %2 = load i64, i64 *%a3 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1 @@ -833,6 +960,12 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2) @@ -870,6 +1003,12 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsd: +; ZNVER1: 
# BB#0: +; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2) @@ -907,6 +1046,12 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2) @@ -944,6 +1089,12 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2) @@ -981,6 +1132,12 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsb: +; 
ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2) @@ -1018,6 +1175,12 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2) @@ -1055,6 +1218,12 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2) @@ -1092,6 +1261,12 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pminuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2) @@ -1135,6 +1310,13 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = sext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1179,6 +1361,13 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = sext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1223,6 +1412,13 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: 
vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = sext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1267,6 +1463,13 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1311,6 +1514,13 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = sext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1355,6 +1565,13 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50] 
+; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = sext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1399,6 +1616,13 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = zext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1443,6 +1667,13 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] 
+; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = zext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1487,6 +1718,13 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = zext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1531,6 +1769,13 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = zext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1575,6 +1820,13 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 
sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = zext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1619,6 +1871,13 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = zext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1657,6 +1916,12 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to 
<4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -1695,6 +1960,12 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = mul <4 x i32> %1, %2 @@ -1751,6 +2022,16 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ptest: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: setb %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) @@ -1795,6 +2076,13 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> 
@llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) @@ -1839,6 +2127,13 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) @@ -1884,6 +2179,13 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) %2 = load <2 x double>, <2 x double>* %a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) @@ -1929,6 +2231,13 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundss: +; ZNVER1: # 
BB#0: +; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll index 7ce9ffdbd0ea..adf857e12179 100644 --- a/test/CodeGen/X86/sse42-schedule.ll +++ b/test/CodeGen/X86/sse42-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; GENERIC-LABEL: crc32_32_8: @@ -43,6 +43,13 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call 
i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) @@ -85,6 +92,13 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) %2 = load i16, i16 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) @@ -127,6 +141,13 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) %2 = load i32, i32 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) @@ -169,6 +190,13 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) @@ -211,6 +239,13 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00] ; BTVER2-NEXT: 
movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) %2 = load i64, i64 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) @@ -283,6 +318,19 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX %ECX %RCX ; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX %ECX %RCX +; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -341,6 +389,16 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] ; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; 
ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -393,6 +451,15 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX %ECX %RCX ; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX %ECX %RCX +; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7) @@ -431,6 +498,12 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7) @@ -468,6 +541,12 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; 
BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll index 11afdb7989f1..9ad6b0dfd4d6 100644 --- a/test/CodeGen/X86/sse4a-schedule.ll +++ b/test/CodeGen/X86/sse4a-schedule.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1 define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; GENERIC-LABEL: test_extrq: @@ -11,8 +11,13 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; ; BTVER2-LABEL: test_extrq: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq %xmm1, %xmm0 +; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1) ret <2 x i64> %1 } @@ -26,8 +31,13 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) { ; ; BTVER2-LABEL: test_extrqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq $2, $3, %xmm0 +; 
BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2) ret <2 x i64> %1 } @@ -41,8 +51,13 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertq: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq %xmm1, %xmm0 +; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1) ret <2 x i64> %1 } @@ -56,8 +71,13 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 +; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6) ret <2 x i64> %1 } @@ -73,6 +93,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) ret void } @@ -88,6 +113,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; 
+; ZNVER1-LABEL: test_movntss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) ret void } diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll index f24969a30c33..24ace69ebb9e 100644 --- a/test/CodeGen/X86/ssse3-schedule.ll +++ b/test/CodeGen/X86/ssse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; GENERIC-LABEL: test_pabsb: @@ -52,6 +52,13 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) %2 = load <16 x i8>, <16 x i8> *%a1, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2) @@ -103,6 +110,13 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: 
[1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2) @@ -147,6 +161,11 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2) @@ -196,6 +215,12 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50] ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_palignr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25] +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> @@ -238,6 +263,12 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # 
sched: [1:0.50] ; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) @@ -289,6 +320,12 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -332,6 +369,12 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) @@ -375,6 +418,12 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; 
BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2) @@ -426,6 +475,12 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -469,6 +524,12 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2) @@ -512,6 +573,12 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> 
%a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = bitcast <8 x i16> %1 to <16 x i8> @@ -550,6 +617,11 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhrsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -593,6 +665,12 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2) @@ -644,6 +722,12 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 
# sched: [1:0.50] ; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2) @@ -695,6 +779,12 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2) @@ -746,6 +836,12 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2) diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll index 
29f8e3ed4f78..784b932addc8 100644 --- a/test/CodeGen/X86/statepoint-invoke.ll +++ b/test/CodeGen/X86/statepoint-invoke.ll @@ -95,8 +95,8 @@ left.relocs: right: ; CHECK-LABEL: %right - ; CHECK: movq ; CHECK: movq %rdx, (%rsp) + ; CHECK: movq ; CHECK: callq some_call %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3) to label %right.relocs unwind label %exceptional_return.right diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll index b16426eae3d5..6e7fc7bf1c07 100644 --- a/test/CodeGen/X86/statepoint-stack-usage.ll +++ b/test/CodeGen/X86/statepoint-stack-usage.ll @@ -11,9 +11,9 @@ target triple = "x86_64-pc-linux-gnu" define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_calls ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) ; There should be no more than three moves ; CHECK-NOT: movq %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) @@ -36,9 +36,9 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: reserve_first ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12) %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13) @@ -61,21 +61,21 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_deopt ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: 
movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) @@ -89,9 +89,9 @@ define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 ; CHECK-LABEL: back_to_back_invokes entry: ; The exact stores don't matter, but there need to be three stack slots created - ; CHECK: movq %rdi, 16(%rsp) - ; CHECK: movq %rdx, 8(%rsp) - ; CHECK: movq %rsi, (%rsp) + ; CHECK-DAG: movq %rdi, 16(%rsp) + ; CHECK-DAG: movq %rdx, 8(%rsp) + ; CHECK-DAG: movq %rsi, (%rsp) ; CHECK: callq %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) to label %normal_return unwind label %exceptional_return diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll index 5bc8f983ff06..538d17564957 100644 --- a/test/CodeGen/X86/statepoint-vector.ll +++ b/test/CodeGen/X86/statepoint-vector.ll @@ -49,8 +49,8 @@ entry: ; CHECK: subq $40, %rsp ; CHECK: testb $1, %dil ; CHECK: movaps (%rsi), %xmm0 -; CHECK: movaps %xmm0, 16(%rsp) -; CHECK: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, 16(%rsp) ; CHECK: callq do_safepoint ; CHECK: movaps (%rsp), %xmm0 ; CHECK: addq $40, %rsp diff --git a/test/CodeGen/X86/vec_cmp_uint-128.ll b/test/CodeGen/X86/vec_cmp_uint-128.ll index 8bed14e7e5f5..cad7991c4f3b 100644 --- a/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -463,7 +463,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: gt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -476,7 +476,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: gt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -782,7 +782,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: lt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 @@ -795,7 +795,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: lt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 2b5eb695f53e..87cf2026d1ef 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -135,7 +135,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -433,7 +433,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -444,7 +444,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, 
%xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll index e7bfe3778212..ce0ec6c3875a 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -115,7 +115,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -381,7 +381,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index cd17fcf8c85b..8138442b3eaf 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -130,7 +130,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; 
AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -412,7 +412,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -423,7 +423,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll index 4adc2e2fb6c9..b0433110f181 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -123,7 +123,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -403,7 +403,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index 6719a66f030f..c65c3e7fd004 100644 --- a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -73,7 +73,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: PR20355: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 852c1f4d3d98..04378ee2ee01 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -77,14 +77,19 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; AVX512-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1 %XMM1 
%ZMM1 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v2i64: ; XOP: # BB#0: @@ -207,21 +212,26 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX512-NEXT: vpsubd %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v4i32: ; XOP: # BB#0: @@ -844,28 +854,24 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v2i64: +; AVX512BW: # BB#0: +; 
AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v2i64: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_rotate_v2i64: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512VL-LABEL: constant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; XOP-LABEL: constant_rotate_v2i64: +; XOP: # BB#0: +; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v2i64: ; X32-SSE: # BB#0: @@ -951,26 +957,24 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v4i32: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq 
+; AVX512VL-LABEL: constant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; -; XOPAVX2-LABEL: constant_rotate_v4i32: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: constant_rotate_v4i32: +; XOP: # BB#0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v4i32: ; X32-SSE: # BB#0: @@ -1100,11 +1104,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v8i16: @@ -1281,11 +1281,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v16i8: @@ -1371,12 +1367,18 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlq $50, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # 
kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v2i64: ; XOP: # BB#0: @@ -1412,12 +1414,18 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v4i32: ; XOP: # BB#0: @@ -1544,11 +1552,19 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # BB#0: @@ -1595,14 +1611,19 @@ define <4 x i32> 
@splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v4i32: ; XOP: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 14215e486bf9..3b65b68352b5 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -41,21 +41,25 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; 
AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -128,21 +132,25 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -466,7 +474,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 @@ -483,36 +491,36 @@ 
define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; 
XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, - %lshr = lshr <4 x i64> %a, + %lshr = lshr <4 x i64> %a, %or = or <4 x i64> %shl, %lshr ret <4 x i64> %or } @@ -549,30 +557,33 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; 
XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, %lshr = lshr <8 x i32> %a, @@ -643,30 +654,18 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, %lshr = lshr <16 x i16> %a, @@ -768,32 +767,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, @@ -825,12 +812,17 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq 
$50, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -873,12 +865,17 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -1027,11 +1024,18 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0 +; 
AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # BB#0: @@ -1082,14 +1086,18 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX1: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll new file mode 100644 index 000000000000..fa1b5c1c0cb4 --- /dev/null +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -0,0 +1,831 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VLBW + +; +; Variable Rotates +; + +define <8 x i64> 
@var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: var_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b64 = sub <8 x i64> , %b + %shl = shl <8 x i64> %a, %b + %lshr = lshr <8 x i64> %a, %b64 + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: var_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b32 = sub <16 x i32> , %b + %shl = shl <16 x i32> %a, %b + %lshr = lshr <16 x i32> %a, %b32 + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512F-LABEL: var_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw 
%zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %b16 = sub <32 x i16> , %b + %shl = shl <32 x i16> %a, %b + %lshr = lshr <32 x i16> %a, %b16 + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512F-LABEL: var_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, 
%ymm7, %ymm6, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} 
ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512VL-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 +; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm9, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm1, 
%k2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: 
vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLBW-NEXT: retq + %b8 = sub <64 x i8> , %b + %shl = shl <64 x i8> %a, %b + %lshr = lshr <64 x i8> %a, %b8 + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Constant Rotates +; + +define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: constant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: constant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + 
%shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, 
%ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; 
AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, 
%ymm4 +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: 
constant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_v8i64(<8 x 
i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { +; 
AVX512F-LABEL: splatconstant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; 
AVX512VLBW-LABEL: splatconstant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Masked Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %rmask = and <8 x i64> %lshr, + %lmask = and <8 x i64> %shl, + %or = or <8 x i64> %lmask, %rmask + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %rmask = and <16 x i32> %lshr, + %lmask = and <16 x i32> %shl, + %or = or <16 x i32> %lmask, %rmask + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand %ymm4, %ymm3, 
%ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %rmask = and <32 x i16> %lshr, + %lmask = and <32 x i16> %shl, + %or = or <32 x i16> %lmask, %rmask + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, 
%ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; 
AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %rmask = and <64 x i8> %lshr, + %lmask = and <64 x i8> %shl, + %or = or <64 x i8> %lmask, %rmask + ret <64 x i8> %or +} diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 09e143ddcd4d..5f2b18fc9c03 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: var_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -667,7 +667,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq 
{{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -687,7 +687,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1700,7 +1700,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-LABEL: splatconstant_shift_v4i64: ; XOPAVX2: # BB#0: ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057594037927936,72057594037927936,72057594037927936,72057594037927936] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 820178d2d992..5f00e55e225b 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -745,7 +745,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -755,7 +755,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd 
%zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 30e5661d5485..4a7d25c1376e 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -179,7 +179,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -189,7 +189,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -432,7 +432,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -442,7 +442,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; 
AVX512CD-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 3bf677aadf19..2fce8a601931 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -89,7 +89,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -99,7 +99,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -235,7 +235,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -245,7 +245,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll index 
5503cfc357e5..5825a56b6f99 100644 --- a/test/CodeGen/X86/vselect-avx.ll +++ b/test/CodeGen/X86/vselect-avx.ll @@ -58,8 +58,8 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: movq (%rdi,%rsi,8), %rax -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5] ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -108,7 +108,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 @@ -117,7 +117,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] ; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3] ; AVX2-NEXT: vpmulld %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll index 48753ad4fd76..5731b63f3bc1 100644 --- a/test/CodeGen/X86/widen_arith-2.ll +++ b/test/CodeGen/X86/widen_arith-2.ll @@ -16,20 +16,17 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; CHECK-NEXT: .LBB0_2: # %forbody ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl 
{{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %ecx +; CHECK-NEXT: leal (,%eax,8), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: psubw %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pshufb %xmm2, %xmm3 -; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8) +; CHECK-NEXT: movq %xmm3, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll index e55d62a461aa..cc6fb27a6293 100644 --- a/test/CodeGen/X86/widen_cast-4.ll +++ b/test/CodeGen/X86/widen_cast-4.ll @@ -16,22 +16,19 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; NARROW-NEXT: .LBB0_2: # %forbody ; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1 ; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %ecx +; NARROW-NEXT: leal (,%eax,8), %ecx ; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx +; NARROW-NEXT: addl %ecx, %edx +; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx +; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; 
NARROW-NEXT: psubw %xmm0, %xmm2 ; NARROW-NEXT: psllw $8, %xmm2 ; NARROW-NEXT: psraw $8, %xmm2 ; NARROW-NEXT: psraw $2, %xmm2 ; NARROW-NEXT: pshufb %xmm1, %xmm2 -; NARROW-NEXT: movq %xmm2, (%edx,%ecx,8) +; NARROW-NEXT: movq %xmm2, (%edx,%eax,8) ; NARROW-NEXT: incl (%esp) ; NARROW-NEXT: .LBB0_1: # %forcond ; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 @@ -54,24 +51,21 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; WIDE-NEXT: .LBB0_2: # %forbody ; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1 ; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %ecx +; WIDE-NEXT: leal (,%eax,8), %ecx ; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: addl %ecx, %edx +; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3 +; WIDE-NEXT: pinsrd $1, 4(%ecx,%eax,8), %xmm3 ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, %xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%ecx,8) -; WIDE-NEXT: movd %xmm3, (%edx,%ecx,8) +; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%eax,8) +; WIDE-NEXT: movd %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: .LBB0_1: # %forcond ; WIDE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/win64-nosse-csrs.ll b/test/CodeGen/X86/win64-nosse-csrs.ll index d1860b721044..29d4f165392e 100644 --- a/test/CodeGen/X86/win64-nosse-csrs.ll +++ b/test/CodeGen/X86/win64-nosse-csrs.ll @@ -20,7 +20,7 @@ entry-block: } ; Function Attrs: nounwind uwtable -define x86_64_win64cc i64 @peach() unnamed_addr #1 { 
+define win64cc i64 @peach() unnamed_addr #1 { entry-block: %0 = call i64 @banana() ret i64 %0 diff --git a/test/CodeGen/X86/win64_nonvol.ll b/test/CodeGen/X86/win64_nonvol.ll index 8e5f6cec1ab7..e1c615d75f28 100644 --- a/test/CodeGen/X86/win64_nonvol.ll +++ b/test/CodeGen/X86/win64_nonvol.ll @@ -5,7 +5,7 @@ ; Win64 nonvolatile registers get saved. ; CHECK-LABEL: bar: -define x86_64_win64cc void @bar(i32 %a, i32 %b) { +define win64cc void @bar(i32 %a, i32 %b) { ; CHECK-DAG: pushq %rdi ; CHECK-DAG: pushq %rsi ; CHECK-DAG: movaps %xmm6, diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll index a0b552d4d584..6b4273512013 100644 --- a/test/CodeGen/X86/win64_params.ll +++ b/test/CodeGen/X86/win64_params.ll @@ -12,7 +12,7 @@ entry: ret i32 %add } -define x86_64_win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { +define win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { entry: ; CHECK: movl 48(%rsp), %eax ; CHECK: addl 40(%rsp), %eax diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll index 0faa24ef7290..c7550a467a35 100644 --- a/test/CodeGen/X86/win_chkstk.ll +++ b/test/CodeGen/X86/win_chkstk.ll @@ -51,7 +51,7 @@ entry: ; Make sure we don't call __chkstk or __alloca on non-Windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X32: calll __chkstk ; WIN_X64: callq __chkstk diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll index c9a5fc2b3288..b4b8010ec564 100644 --- a/test/CodeGen/X86/win_coreclr_chkstk.ll +++ b/test/CodeGen/X86/win_coreclr_chkstk.ll @@ -103,7 +103,7 @@ entry: ; Make sure we don't emit the probe sequence if not on windows even if the ; caller has the Win64 calling convention. 
-define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx @@ -115,7 +115,7 @@ entry: declare i32 @bar(i8*) nounwind ; Within-body inline probe expansion -define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind { +define win64cc i32 @main4k_alloca(i64 %n) nounwind { entry: ; WIN_X64: callq bar ; WIN_X64: movq %gs:16, [[R:%r.*]] diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index 299190e8a595..e3387a2709cb 100644 --- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -3,7 +3,7 @@ ; Verify that the var arg parameters which are passed in registers are stored ; in home stack slots allocated by the caller and that AP is correctly ; calculated. -define x86_64_win64cc void @average_va(i32 %count, ...) nounwind { +define win64cc void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq ; CHECK: movq %r9, 40(%rsp) @@ -24,7 +24,7 @@ declare void @llvm.va_end(i8*) nounwind ; CHECK-LABEL: f5: ; CHECK: pushq ; CHECK: leaq 56(%rsp), -define x86_64_win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { +define win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -35,7 +35,7 @@ entry: ; CHECK-LABEL: f4: ; CHECK: pushq ; CHECK: leaq 48(%rsp), -define x86_64_win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -46,7 +46,7 @@ entry: ; CHECK-LABEL: f3: ; CHECK: pushq ; CHECK: leaq 40(%rsp), -define x86_64_win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) nounwind { +define win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) 
nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -62,7 +62,7 @@ entry: ; CHECK: movq [[REG_copy1]], 8(%rsp) ; CHECK: movq [[REG_copy1]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy1(i64 %a0, ...) nounwind { +define win64cc void @copy1(i64 %a0, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -78,7 +78,7 @@ entry: ; CHECK: movq [[REG_copy4]], 8(%rsp) ; CHECK: movq [[REG_copy4]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -96,7 +96,7 @@ entry: ; CHECK: movq [[REG_arg4_2]], (%rsp) ; CHECK: movl 48(%rsp), %eax ; CHECK: ret -define x86_64_win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll new file mode 100644 index 000000000000..39877c14429f --- /dev/null +++ b/test/CodeGen/X86/x86-cmov-converter.ll @@ -0,0 +1,321 @@ +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; This test checks that x86-cmov-converter optimization transform CMOV +;; instruction into branches when it is profitable. +;; There are 5 cases below: +;; 1. CmovInCriticalPath: +;; CMOV depends on the condition and it is in the hot path. +;; Thus, it worths transforming. +;; +;; 2. CmovNotInCriticalPath: +;; similar test like in (1), just that CMOV is not in the hot path. +;; Thus, it does not worth transforming. +;; +;; 3. 
MaxIndex: +;; Maximum calculation algorithm that is looking for the max index, +;; calculating CMOV value is cheaper than calculating CMOV condition. +;; Thus, it worths transforming. +;; +;; 4. MaxValue: +;; Maximum calculation algorithm that is looking for the max value, +;; calculating CMOV value is not cheaper than calculating CMOV condition. +;; Thus, it does not worth transforming. +;; +;; 5. BinarySearch: +;; Usually, binary search CMOV is not predicted. +;; Thus, it does not worth transforming. +;; +;; Test was created using the following command line: +;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - +;; Where foo.c is: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; } +;;} +;; +;; +;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; d[i] /= b; +;; } +;;} +;; +;; +;;int MaxIndex(int n, int *a) { +;; int t = 0; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > a[t]) +;; t = i; +;; } +;; return a[t]; +;;} +;; +;; +;;int MaxValue(int n, int *a) { +;; int t = a[0]; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > t) +;; t = a[i]; +;; } +;; return t; +;;} +;; +;;typedef struct Node Node; +;;struct Node { +;; unsigned Val; +;; Node *Right; +;; Node *Left; +;;}; +;; +;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) { +;; while (Curr->Val > Next->Val) { +;; Curr = Next; +;; if (Mask & (0x1 << Curr->Val)) +;; Next = Curr->Right; +;; else +;; Next = Curr->Left; +;; } +;; return Curr->Val; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%struct.Node = type { i32, %struct.Node*, %struct.Node* } + +; CHECK-LABEL: CmovInHotPath +; CHECK-NOT: cmov +; CHECK: jg + +define void 
@CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 0 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. = select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: CmovNotInHotPath +; CHECK: cmovg + +define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 { +entry: + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. 
= select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv + %1 = load i32, i32* %arrayidx7, align 4 + %div = sdiv i32 %1, %b + store i32 %div, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: MaxIndex +; CHECK-NOT: cmov +; CHECK: jg + +define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = sext i32 %i.0.t.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %t.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %t.0.lcssa + %0 = load i32, i32* %arrayidx5, align 4 + ret i32 %0 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %t.015 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1 + %2 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %1, %2 + %3 = trunc i64 %indvars.iv to i32 + %i.0.t.0 = select i1 %cmp3, i32 %3, i32 %t.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: MaxValue +; CHECK-NOT: jg +; CHECK: cmovg + +define i32 @MaxValue(i32 %n, i32* 
nocapture readonly %a) #0 { +entry: + %0 = load i32, i32* %a, align 4 + %cmp13 = icmp sgt i32 %n, 1 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ] + ret i32 %t.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ] + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx1, align 4 + %cmp2 = icmp sgt i32 %1, %t.014 + %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: BinarySearch +; CHECK: cmov + +define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 { +entry: + %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0 + %0 = load i32, i32* %Val8, align 8 + %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0 + %1 = load i32, i32* %Val19, align 8 + %cmp10 = icmp ugt i32 %0, %1 + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %2 = phi i32 [ %4, %while.body ], [ %1, %entry ] + %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ] + %shl = shl i32 1, %2 + %and = and i32 %shl, %Mask + %tobool = icmp eq i32 %and, 0 + %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2 + %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1 + %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right + %3 = load 
%struct.Node*, %struct.Node** %Left.sink, align 8 + %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0 + %4 = load i32, i32* %Val1, align 8 + %cmp = icmp ugt i32 %2, %4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ] + ret i32 %.lcssa +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following test checks that x86-cmov-converter optimization transforms +;; CMOV instructions into branch correctly. +;; +;; MBB: +;; cond = cmp ... +;; v1 = CMOVgt t1, f1, cond +;; v2 = CMOVle s1, f2, cond +;; +;; Where: t1 = 11, f1 = 22, f2 = a +;; +;; After CMOV transformation +;; ------------------------- +;; MBB: +;; cond = cmp ... +;; ja %SinkMBB +;; +;; FalseMBB: +;; jmp %SinkMBB +;; +;; SinkMBB: +;; %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] +;; %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch +;; ; true-value with false-value +;; ; Phi instruction cannot use +;; ; previous Phi instruction result +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: Transform +; CHECK-NOT: cmov +; CHECK: divl [[a:%[0-9a-z]*]] +; CHECK: cmpl [[a]], %eax +; CHECK: movl $11, [[s1:%[0-9a-z]*]] +; CHECK: movl [[a]], [[s2:%[0-9a-z]*]] +; CHECK: ja [[SinkBB:.*]] +; CHECK: [[FalseBB:.*]]: +; CHECK: movl $22, [[s1]] +; CHECK: movl $22, [[s2]] +; CHECK: [[SinkBB]]: +; CHECK: ja + +define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 { +entry: + %cmp10 = icmp ugt i32 0, %n + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ] + %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i + %x = load i32, i32* %arr_i, align 4 + %div = udiv i32 %x, %a + %cond = icmp ugt i32 %div, %a + %condOpp = icmp ule i32 %div, %a + %s1 = select i1 %cond, i32 11, i32 22 + %s2 = 
select i1 %condOpp, i32 %s1, i32 %a + %sum = urem i32 %s1, %s2 + store i32 %sum, i32* %arr_i, align 4 + %i_inc = add i32 %i, 1 + %cmp = icmp ugt i32 %i_inc, %n + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +attributes #0 = {"target-cpu"="x86-64"} diff --git a/test/CodeGen/XCore/varargs.ll b/test/CodeGen/XCore/varargs.ll index 2e364b275610..b6f716d66c9d 100644 --- a/test/CodeGen/XCore/varargs.ll +++ b/test/CodeGen/XCore/varargs.ll @@ -26,10 +26,10 @@ entry: ; CHECK-LABEL: test_vararg ; CHECK: extsp 6 ; CHECK: stw lr, sp[1] -; CHECK: stw r3, sp[6] -; CHECK: stw r0, sp[3] -; CHECK: stw r1, sp[4] -; CHECK: stw r2, sp[5] +; CHECK-DAG: stw r3, sp[6] +; CHECK-DAG: stw r0, sp[3] +; CHECK-DAG: stw r1, sp[4] +; CHECK-DAG: stw r2, sp[5] ; CHECK: ldaw r0, sp[3] ; CHECK: stw r0, sp[2] %list = alloca i8*, align 4 diff --git a/test/DebugInfo/Generic/namespace.ll b/test/DebugInfo/Generic/namespace.ll index 983e20db4111..5a8f65263192 100644 --- a/test/DebugInfo/Generic/namespace.ll +++ b/test/DebugInfo/Generic/namespace.ll @@ -54,17 +54,14 @@ ; CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; This is a bug, it should be in F2 but it inherits the file from its -; enclosing scope -; CHECK-NEXT: DW_AT_decl_file{{.*}}stdin +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(15) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS2]]}) ; CHECK: NULL ; CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; Same bug as above, this should be F2 -; CHECK-NEXT: DW_AT_decl_file{{.*}}debug-info-namespace.cpp +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(18) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]}) ; CHECK-NOT: NULL @@ -320,29 +317,29 @@ attributes #1 = { nounwind readnone } !31 = !DIGlobalVariable(name: "i", linkageName: "_ZN1A1B1iE", line: 20, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !32 = !DIGlobalVariable(name: "var_fwd", linkageName: 
"_ZN1A1B7var_fwdE", line: 44, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !33 = !{!34, !35, !36, !37, !40, !41, !42, !43, !44, !45, !47, !48, !49, !51, !54, !55, !56} -!34 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 15, scope: !7, entity: !6) -!35 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 18, scope: !0, entity: !7) -!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, name: "E", scope: !0, entity: !7) -!37 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 23, scope: !38, entity: !6) +!34 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 15, scope: !7, entity: !6) +!35 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 18, scope: !0, entity: !7) +!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 19, name: "E", scope: !0, entity: !7) +!37 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 23, scope: !38, entity: !6) !38 = distinct !DILexicalBlock(line: 22, column: 10, file: !5, scope: !39) !39 = distinct !DILexicalBlock(line: 22, column: 7, file: !5, scope: !21) -!40 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 26, scope: !21, entity: !7) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 27, scope: !21, entity: !4) -!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 28, scope: !21, entity: !8) -!43 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 29, scope: !21, entity: !14) -!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 30, scope: !21, entity: !31) -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 31, scope: !21, entity: !46) +!40 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 26, scope: !21, entity: !7) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 27, scope: !21, entity: !4) +!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 28, scope: !21, entity: !8) +!43 = 
!DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 29, scope: !21, entity: !14) +!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 30, scope: !21, entity: !31) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 31, scope: !21, entity: !46) !46 = !DIDerivedType(tag: DW_TAG_typedef, name: "baz", line: 7, file: !5, scope: !6, baseType: !8) -!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 32, name: "X", scope: !21, entity: !7) -!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 33, name: "Y", scope: !21, entity: !47) -!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 34, scope: !21, entity: !50) +!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 32, name: "X", scope: !21, entity: !7) +!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 33, name: "Y", scope: !21, entity: !47) +!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 34, scope: !21, entity: !50) !50 = !DIGlobalVariable(name: "var_decl", linkageName: "_ZN1A1B8var_declE", line: 8, isLocal: false, isDefinition: false, scope: !6, file: !18, type: !13) -!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 35, scope: !21, entity: !52) +!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 35, scope: !21, entity: !52) !52 = !DISubprogram(name: "func_decl", linkageName: "_ZN1A1B9func_declEv", line: 9, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, file: !5, scope: !6, type: !19, variables: !53) !53 = !{} ; previously: invalid DW_TAG_base_type -!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 36, scope: !21, entity: !32) -!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 37, scope: !21, entity: !26) -!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !7, entity: !31) +!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, 
file: !5, line: 36, scope: !21, entity: !32) +!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 37, scope: !21, entity: !26) +!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 42, scope: !7, entity: !31) !57 = !{i32 2, !"Dwarf Version", i32 2} !58 = !{i32 2, !"Debug Info Version", i32 3} !59 = !{!"clang version 3.6.0 "} diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test index 1887af2e8268..14fe4bb352f6 100644 --- a/test/DebugInfo/PDB/pdbdump-headers.test +++ b/test/DebugInfo/PDB/pdbdump-headers.test @@ -91,189 +91,189 @@ ALL-NEXT: Mod 0001 | `* Linker *`: ALL: Types (TPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 75 records -ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 205956] -ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 163561] +ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 0x32484] +ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 0x27EE9] ALL-NEXT: return type = 0x0074 (int), # args = 0, param list = 0x1000 ALL-NEXT: calling conv = cdecl, options = None -ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 59811] +ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 0xE9A3] ALL-NEXT: - LF_ENUMERATE [apartment = 1] ALL-NEXT: - LF_ENUMERATE [single = 2] ALL-NEXT: - LF_ENUMERATE [free = 3] ALL-NEXT: - LF_ENUMERATE [neutral = 4] ALL-NEXT: - LF_ENUMERATE [both = 5] -ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] `__vc_attributes::threadingAttribute::threading_e` +ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 0x32D6F] `__vc_attributes::threadingAttribute::threading_e` ALL-NEXT: unique name: `.?AW4threading_e@threadingAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1002, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 0x3FF9] 
`__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 247078] +ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 0x3C526] ALL-NEXT: referent = 0x1004, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 194342] +ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 0x2F726] ALL-NEXT: 0x1003: `__vc_attributes::threadingAttribute::threading_e` -ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 254156] +ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 0x3E0CC] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1006 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 194536] +ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 0x2F7E8] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 167492] +ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 0x28E44] ALL-NEXT: - Method [type = 0x1007, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1008, vftable offset = -1, attrs = public] -ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 185421] +ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 0x2D44D] ALL-NEXT: - LF_NESTTYPE [name = `threading_e`, parent = 0x1003] ALL-NEXT: - LF_METHOD [name = `threadingAttribute`, # overloads = 2, overload list = 0x1009] ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x1003, offset = 0, attrs = public] -ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 0x1D2F4] 
`__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x100A ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 261871] +ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 0x3FEEF] ALL-NEXT: - LF_ENUMERATE [native = 0] ALL-NEXT: - LF_ENUMERATE [com = 1] ALL-NEXT: - LF_ENUMERATE [managed = 2] -ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] `__vc_attributes::event_receiverAttribute::type_e` +ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 0x305E7] `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_receiverAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 0xBBB8] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 251486] +ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 0x3D65E] ALL-NEXT: referent = 0x100E, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 134580] +ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 0x20DB4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: 0x0030 (bool): `bool` -ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 148190] +ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 0x242DE] ALL-NEXT: return type = 0x0003 (void), # args = 2, param list = 0x1010 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1012 | LF_ARGLIST [size 
= 12, hash = 113636] +ALL-NEXT: 0x1012 | LF_ARGLIST [size = 12, hash = 0x1BBE4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` -ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 53336] +ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 0xD058] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1012 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 55779] +ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 0xD9E3] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 220695] +ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 0x35E17] ALL-NEXT: - Method [type = 0x1011, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1013, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1014, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 198114] +ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 0x305E2] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x100D] ALL-NEXT: - LF_METHOD [name = `event_receiverAttribute`, # overloads = 3, overload list = 0x1015] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x100D, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `layout_dependent`, Type = 0x0030 (bool), offset = 4, attrs = public] -ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 0x244FE] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1016 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 
0x1018 | LF_FIELDLIST [size = 48, hash = 81128] +ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 0x13CE8] ALL-NEXT: - LF_ENUMERATE [never = 0] ALL-NEXT: - LF_ENUMERATE [allowed = 1] ALL-NEXT: - LF_ENUMERATE [always = 2] -ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] `__vc_attributes::aggregatableAttribute::type_e` +ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 0xEAFE] `__vc_attributes::aggregatableAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@aggregatableAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1018, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 0x350A1] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 174209] +ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 0x2A881] ALL-NEXT: referent = 0x101A, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 159978] +ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 0x270EA] ALL-NEXT: 0x1019: `__vc_attributes::aggregatableAttribute::type_e` -ALL-NEXT: 0x101D | LF_MFUNCTION [size = 28, hash = 249504] +ALL-NEXT: 0x101D | LF_MFUNCTION [size = 28, hash = 0x3CEA0] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x101C ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 141941] +ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 0x22A75] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 
0x101F | LF_METHODLIST [size = 20, hash = 238785] +ALL-NEXT: 0x101F | LF_METHODLIST [size = 20, hash = 0x3A4C1] ALL-NEXT: - Method [type = 0x101D, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x101E, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 6214] +ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 0x1846] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1019] ALL-NEXT: - LF_METHOD [name = `aggregatableAttribute`, # overloads = 2, overload list = 0x101F] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1019, offset = 0, attrs = public] -ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 0x172D7] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1020 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] `__vc_attributes::event_sourceAttribute::type_e` +ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 0x24F99] `__vc_attributes::event_sourceAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 135589] +ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 0x211A5] ALL-NEXT: - LF_ENUMERATE [speed = 0] ALL-NEXT: - LF_ENUMERATE [size = 1] -ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] `__vc_attributes::event_sourceAttribute::optimize_e` +ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 0x11E9D] `__vc_attributes::event_sourceAttribute::optimize_e` ALL-NEXT: unique name: `.?AW4optimize_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1023, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is 
nested -ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 0x17900] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 254299] +ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 0x3E15B] ALL-NEXT: referent = 0x1025, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 17744] +ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 0x4550] ALL-NEXT: 0x1022: `__vc_attributes::event_sourceAttribute::type_e` -ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 239514] +ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 0x3A79A] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1027 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 173189] +ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 0x2A485] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 130544] +ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 0x1FDF0] ALL-NEXT: - Method [type = 0x1028, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1029, vftable offset = -1, attrs = public] -ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 204437] +ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 0x31E95] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1022] ALL-NEXT: - LF_NESTTYPE [name = `optimize_e`, parent = 0x1024] ALL-NEXT: - LF_METHOD [name = `event_sourceAttribute`, # overloads = 2, overload list = 0x102A] ALL-NEXT: - 
LF_MEMBER [name = `type`, Type = 0x1022, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `optimize`, Type = 0x1024, offset = 4, attrs = public] ALL-NEXT: - LF_MEMBER [name = `decorate`, Type = 0x0030 (bool), offset = 8, attrs = public] -ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 0x3A3E0] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x102B ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 144673] +ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 0x23521] ALL-NEXT: - LF_ENUMERATE [dll = 1] ALL-NEXT: - LF_ENUMERATE [exe = 2] ALL-NEXT: - LF_ENUMERATE [service = 3] ALL-NEXT: - LF_ENUMERATE [unspecified = 4] ALL-NEXT: - LF_ENUMERATE [EXE = 2] ALL-NEXT: - LF_ENUMERATE [SERVICE = 3] -ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] `__vc_attributes::moduleAttribute::type_e` +ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 0x1C1CF] `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@moduleAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x102D, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 0x302BA] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 256035] +ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 0x3E823] ALL-NEXT: referent = 0x102F, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1031 | LF_MODIFIER [size = 12, hash = 101096] +ALL-NEXT: 0x1031 | 
LF_MODIFIER [size = 12, hash = 0x18AE8] ALL-NEXT: referent = 0x0070 (char), modifiers = const -ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 231280] +ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 0x38770] ALL-NEXT: referent = 0x1031, mode = pointer, opts = None, kind = ptr32 -ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 52156] +ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 0xCBBC] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: 0x1032: `const char*` ALL-NEXT: 0x1032: `const char*` @@ -289,25 +289,25 @@ ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x1032: `const char*` ALL-NEXT: 0x1032: `const char*` -ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 48854] +ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 0xBED6] ALL-NEXT: return type = 0x0003 (void), # args = 15, param list = 0x1033 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 170035] +ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 0x29833] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` -ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 177041] +ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 0x2B391] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1035 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 102745] +ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 0x19159] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 16947] +ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 0x4233] ALL-NEXT: - Method [type = 0x1034, vftable offset = -1, attrs = public] ALL-NEXT: 
- Method [type = 0x1036, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1037, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 183703] +ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 0x2CD97] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x102E] ALL-NEXT: - LF_METHOD [name = `moduleAttribute`, # overloads = 3, overload list = 0x1038] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x102E, offset = 0, attrs = public] @@ -325,11 +325,11 @@ ALL-NEXT: - LF_MEMBER [name = `hidden`, Type = 0x0030 (bool), offset ALL-NEXT: - LF_MEMBER [name = `restricted`, Type = 0x0030 (bool), offset = 45, attrs = public] ALL-NEXT: - LF_MEMBER [name = `custom`, Type = 0x1032, offset = 48, attrs = public] ALL-NEXT: - LF_MEMBER [name = `resource_name`, Type = 0x1032, offset = 52, attrs = public] -ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 0x180F4] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1039 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 35693] +ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 0x8B6D] ALL-NEXT: - LF_ENUMERATE [eAnyUsage = 0] ALL-NEXT: - LF_ENUMERATE [eCoClassUsage = 1] ALL-NEXT: - LF_ENUMERATE [eCOMInterfaceUsage = 2] @@ -360,58 +360,58 @@ ALL-NEXT: - LF_ENUMERATE [eModuleUsage = 16777216] ALL-NEXT: - LF_ENUMERATE [eIllegalUsage = 33554432] ALL-NEXT: - LF_ENUMERATE [eAsynchronousUsage = 67108864] ALL-NEXT: - LF_ENUMERATE [eAnyIDLUsage = 4161535] -ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] `__vc_attributes::helper_attributes::usageAttribute::usage_e` +ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 0x29D40] `__vc_attributes::helper_attributes::usageAttribute::usage_e` ALL-NEXT: unique name: 
`.?AW4usage_e@usageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x103B, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 0x31B78] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 139292] +ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 0x2201C] ALL-NEXT: referent = 0x103D, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 49018] +ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 0xBF7A] ALL-NEXT: 0x0075 (unsigned): `unsigned` -ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 43821] +ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 0xAB2D] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x103F ALL-NEXT: class type = 0x103D, this type = 0x103E, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 202555] +ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 0x3173B] ALL-NEXT: - LF_NESTTYPE [name = `usage_e`, parent = 0x103C] ALL-NEXT: - LF_ONEMETHOD [name = `usageAttribute`] ALL-NEXT: type = 0x1040, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x0075 (unsigned), offset = 0, attrs = public] -ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 0x284B0] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1041 ALL-NEXT: options: 
has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 215835] +ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 0x34B1B] ALL-NEXT: - LF_ENUMERATE [eBoolean = 0] ALL-NEXT: - LF_ENUMERATE [eInteger = 1] ALL-NEXT: - LF_ENUMERATE [eFloat = 2] ALL-NEXT: - LF_ENUMERATE [eDouble = 3] -ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` +ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 0x22D21] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@v1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x1043, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 0xCD36] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 44186] +ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 0xAC9A] ALL-NEXT: referent = 0x1045, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 103930] +ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 0x195FA] ALL-NEXT: 0x1044: `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` -ALL-NEXT: 0x1048 | LF_MFUNCTION [size = 28, hash = 110942] +ALL-NEXT: 0x1048 | LF_MFUNCTION [size = 28, hash = 0x1B15E] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1047 ALL-NEXT: class type = 0x1045, this type = 0x1046, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1049 | LF_FIELDLIST [size = 64, hash = 17991] +ALL-NEXT: 0x1049 | LF_FIELDLIST [size 
= 64, hash = 0x4647] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1044] ALL-NEXT: - LF_ONEMETHOD [name = `v1_alttypeAttribute`] ALL-NEXT: type = 0x1048, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1044, offset = 0, attrs = public] -ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 0x340DF] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1049 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -421,29 +421,29 @@ ALL: Hash Adjusters: ALL: Types (IPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 15 records -ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7186] +ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C12] ALL-NEXT: udt = 0x100B, mod = 1, file = 1, line = 481 -ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7198] +ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C1E] ALL-NEXT: udt = 0x1017, mod = 1, file = 1, line = 194 -ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7180] +ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C0C] ALL-NEXT: udt = 0x1021, mod = 1, file = 1, line = 603 -ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7191] +ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C17] ALL-NEXT: udt = 0x102C, mod = 1, file = 1, line = 1200 -ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7201] +ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C21] ALL-NEXT: udt = 0x103A, mod = 1, file = 1, line = 540 -ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7241] +ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C49] ALL-NEXT: udt = 0x1042, mod = 1, file 
= 1, line = 108 -ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7249] +ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C51] ALL-NEXT: udt = 0x104A, mod = 1, file = 1, line = 96 -ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 80727] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs -ALL-NEXT: 0x1008 | LF_STRING_ID [size = 76, hash = 154177] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe -ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 75189] ID: , String: empty.cpp -ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 253662] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb -ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 193467] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows -ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 222705] +ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 0x13B57] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs +ALL-NEXT: 0x1008 | LF_STRING_ID [size = 76, hash = 0x25A41] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe +ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 0x125B5] ID: , String: empty.cpp +ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 0x3DEDE] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb +ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 0x2F3BB] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows +ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 0x365F1] ALL-NEXT: 0x100B: `-Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files 
(x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows` -ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 186099] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X -ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 257108] +ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 0x2D6F3] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X +ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 0x3EC54] ALL-NEXT: 0x1007: `d:\src\llvm\test\DebugInfo\PDB\Inputs` ALL-NEXT: 0x1008: `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe` ALL-NEXT: 0x1009: `empty.cpp` @@ -472,13 +472,13 @@ ALL-NEXT: frontend = 18.0.31101.0, backend = 18.0.31101.0 ALL-NEXT: flags = security checks ALL-NEXT: 120 | S_GPROC32 [size = 44] `main` ALL-NEXT: parent = 0, end = 196, addr = 0001:0016, code size = 10 -ALL-NEXT: debug start = 3, debug end = 8, flags = has fp +ALL-NEXT: type = `0x1001 (int ())`, debug start = 3, debug end = 8, flags = has fp ALL-NEXT: 164 | S_FRAMEPROC [size = 32] ALL-NEXT: size = 0, padding size = 0, offset to padding = 0 ALL-NEXT: bytes of callee saved registers = 0, exception handler addr = 0000:0000 ALL-NEXT: flags = has async eh | opt speed ALL-NEXT: 196 | S_END [size = 4] -ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `4110` +ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `0x100E` ALL-NEXT: Mod 0001 | `* Linker *`: ALL-NEXT: 4 | S_OBJNAME [size = 20] sig=0, `* Linker *` ALL-NEXT: 24 | S_COMPILE3 [size = 48] diff --git a/test/DebugInfo/X86/DIModule.ll b/test/DebugInfo/X86/DIModule.ll index 1fe7f0c5fabe..eacdfe10f53b 100644 --- a/test/DebugInfo/X86/DIModule.ll +++ b/test/DebugInfo/X86/DIModule.ll @@ -18,7 +18,7 @@ target triple = "x86_64-apple-macosx" !1 = !DIFile(filename: "/llvm/tools/clang/test/Modules/", directory: "/") !2 = !{} !3 = !{!4} -!4 = 
!DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, line: 5) +!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, file: !1, line: 5) !5 = !DIModule(scope: null, name: "DebugModule", configMacros: "-DMODULES=0", includePath: "/llvm/tools/clang/test/Modules/Inputs", isysroot: "/") !6 = !{i32 2, !"Dwarf Version", i32 4} !7 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/DebugInfo/X86/DIModuleContext.ll b/test/DebugInfo/X86/DIModuleContext.ll index a63fd0f373cd..02d4441c8234 100644 --- a/test/DebugInfo/X86/DIModuleContext.ll +++ b/test/DebugInfo/X86/DIModuleContext.ll @@ -24,7 +24,7 @@ target triple = "x86_64-apple-macosx" !4 = !{} !5 = !{!0} !6 = !{!7} -!7 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, line: 11) +!7 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, file: !3, line: 11) !8 = !DIModule(scope: null, name: "Module", includePath: ".", isysroot: "/") !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64) !10 = !DICompositeType(tag: DW_TAG_structure_type, name: "s", scope: !8, file: !3, line: 1, flags: DIFlagFwdDecl) diff --git a/test/DebugInfo/X86/fission-inline.ll b/test/DebugInfo/X86/fission-inline.ll index 45e0127294d1..eadcd15b2f21 100644 --- a/test/DebugInfo/X86/fission-inline.ll +++ b/test/DebugInfo/X86/fission-inline.ll @@ -110,7 +110,7 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !16 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !17 = !DISubprogram(name: "f2", linkageName: "_ZN3foo2f2IiEEvv", line: 10, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 10, file: !1, scope: !4, type: !12, templateParams: !14) !18 = !{!19} -!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, scope: !20, entity: !4) +!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: 
!1, line: 19, scope: !20, entity: !4) !20 = distinct !DILexicalBlock(line: 16, column: 13, file: !1, scope: !21) !21 = distinct !DILexicalBlock(line: 16, column: 7, file: !1, scope: !10) !22 = !{i32 2, !"Dwarf Version", i32 4} diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll index 533ab838a732..df1d113538ea 100644 --- a/test/DebugInfo/X86/gnu-public-names.ll +++ b/test/DebugInfo/X86/gnu-public-names.ll @@ -345,9 +345,9 @@ attributes #1 = { nounwind readnone } !42 = !DINamespace(scope: !43) !43 = !DINamespace(name: "outer", scope: null) !44 = !{!45, !47} -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, line: 34) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, file:!3, line: 34) !46 = !DIGlobalVariable(name: "global_namespace_variable_decl", linkageName: "_ZN2ns30global_namespace_variable_declE", scope: !18, file: !3, line: 28, type: !9, isLocal: false, isDefinition: false) -!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, line: 43) +!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, file: !3, line: 43) !48 = !{i32 2, !"Dwarf Version", i32 4} !49 = !{i32 2, !"Debug Info Version", i32 3} !50 = !{!"clang version 3.7.0 (trunk 234897) (llvm/trunk 234911)"} diff --git a/test/DebugInfo/X86/lexical-block-file-inline.ll b/test/DebugInfo/X86/lexical-block-file-inline.ll index 0f85a5573f03..9f040f41ec5e 100644 --- a/test/DebugInfo/X86/lexical-block-file-inline.ll +++ b/test/DebugInfo/X86/lexical-block-file-inline.ll @@ -134,7 +134,7 @@ attributes #2 = { nounwind } !8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !9, file: !9, line: 6, type: !5, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !DIFile(filename: "test.h", directory: "/") !10 = !{!11} -!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, 
entity: !14, line: 1) +!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !14, file: !1, line: 1) !12 = !DILexicalBlockFile(scope: !13, file: !9, discriminator: 0) !13 = distinct !DILexicalBlock(scope: !4, file: !1, line: 3) !14 = !DINamespace(name: "N", scope: null) diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll index a8278c9dcf83..c94572f7d23c 100644 --- a/test/DebugInfo/X86/pr19307.ll +++ b/test/DebugInfo/X86/pr19307.ll @@ -105,26 +105,26 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !19 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", line: 65, file: !20, scope: !10, baseType: !8) !20 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc") !21 = !{!22, !26, !29, !33, !38, !41} -!22 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 57, scope: !23, entity: !25) +!22 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !1, line: 57, scope: !23, entity: !25) !23 = !DINamespace(name: "__gnu_debug", scope: null) !24 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", directory: "/llvm_cmake_gcc") !25 = !DINamespace(name: "__debug", scope: !10) -!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 66, scope: !10, entity: !27) +!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 66, scope: !10, entity: !27) !27 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", line: 106, file: !5, baseType: !28) !28 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", line: 95, file: !5, baseType: !4) -!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 141, scope: !10, entity: !30) +!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 141, scope: !10, entity: !30) !30 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", line: 141, file: !31, baseType: !32) !31 = !DIFile(filename: 
"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc") !32 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned) -!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !34, entity: !36) +!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 42, scope: !34, entity: !36) !34 = !DINamespace(name: "__gnu_cxx", scope: null) !35 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", directory: "/llvm_cmake_gcc") !36 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", line: 155, file: !11, scope: !10, baseType: !37) !37 = !DIBasicType(tag: DW_TAG_base_type, name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) -!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 43, scope: !34, entity: !39) +!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 43, scope: !34, entity: !39) !39 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", line: 156, file: !11, scope: !10, baseType: !40) !40 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 55, scope: !10, entity: !6) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 55, scope: !10, entity: !6) !42 = !{i32 2, !"Dwarf Version", i32 4} !43 = !{i32 2, !"Debug Info Version", i32 3} !44 = !{!"clang version 3.5.0 (209308)"} diff --git a/test/DllTool/coff-exports.def b/test/DllTool/coff-exports.def new file mode 100644 index 000000000000..0226886a523c --- /dev/null +++ b/test/DllTool/coff-exports.def @@ -0,0 +1,13 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-import-file +; CHECK: Type: code 
+; CHECK: Name type: name +; CHECK: Symbol: __imp_TestFunction +; CHECK: Symbol: TestFunction diff --git a/test/DllTool/coff-weak-exports.def b/test/DllTool/coff-weak-exports.def new file mode 100644 index 000000000000..511d947d8395 --- /dev/null +++ b/test/DllTool/coff-weak-exports.def @@ -0,0 +1,19 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction==AltTestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit diff --git a/test/DllTool/lit.local.cfg b/test/DllTool/lit.local.cfg new file mode 100644 index 000000000000..482608486d21 --- /dev/null +++ b/test/DllTool/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.def'] diff --git a/test/FileCheck/regex-scope.txt b/test/FileCheck/regex-scope.txt index e77f3f6513a8..989f422c6bcd 100644 --- a/test/FileCheck/regex-scope.txt +++ b/test/FileCheck/regex-scope.txt @@ -1,4 +1,4 @@ -// RUN: FileCheck -check-prefix CHECK -input-file %s %s +// RUN: FileCheck -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,LOCAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL --enable-var-scope -input-file %s %s diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 9827e7a6792b..5eb5388c87a8 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -170,6 +170,26 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { ; CHECK: __asan_memcpy ; CHECK: ret void +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* 
nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { + ; This is a canary test to make sure that these don't get lowered into calls that don't + ; have the element-atomic property. Eventually, asan will have to be enhanced to lower + ; these properly. + ; CHECK-LABEL: memintr_element_atomic_test + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ret void +} + + ; CHECK-LABEL: @test_swifterror ; CHECK-NOT: __asan_report_load ; CHECK: ret void diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll new file mode 100644 index 000000000000..32610ce3b815 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll @@ -0,0 +1,48 @@ +; This check verifies that arguments passed by value get redzones. 
+; RUN: opt < %s -asan -asan-realign-stack=32 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { [8 x i32] } + +declare i32 @bar(%struct.A*) + +; Test behavior for named argument with explicit alignment. The memcpy and +; alloca alignments should match the explicit alignment of 64. +define void @foo(%struct.A* byval align 64 %a) sanitize_address { +entry: +; CHECK-LABEL: foo +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 64 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %a +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 64 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %a) + ret void +} + +; Test behavior for unnamed argument without explicit alignment. In this case, +; the first argument is referenced by the identifier %0 and the ABI requires a +; minimum alignment of 4 bytes since struct.A contains i32s which have 4-byte +; alignment. However, the alloca alignment will be 32 since that is the value +; passed via the -asan-realign-stack option, which is greater than 4. 
+define void @baz(%struct.A* byval) sanitize_address { +entry: +; CHECK-LABEL: baz +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 32 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %0 +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 4 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %0) + ret void +} diff --git a/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll new file mode 100644 index 000000000000..9b6e3db9050f --- /dev/null +++ b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that dfsan handles these intrinsics properly once they have been +;; added to that class hierarchy. 
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @test_memcpy(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memcpy + ; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memmove(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memmove + ; CHECK: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memset(i8* nocapture) { + ; CHECK-LABEL: dfs$test_memset + ; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ret void +} diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll index 3457cfc7e278..344ad86e99e4 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll @@ -233,6 +233,39 @@ entry: ; CHECK: ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls. 
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll index 1c5978e52864..22c8d5c59a16 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll @@ -250,6 +250,38 @@ entry: ; CHECK: 
ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls. + +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll index ffb239a15256..47912b5b6901 100644 --- 
a/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -238,6 +238,41 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, ; CHECK: call i8* @__msan_memmove ; CHECK: ret void +;; ------------ +;; Placeholder tests that will fail once element atomic @llvm.mem[cpy|move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that MSAN handles these intrinsics properly once they have been +;; added to that class hierarchy. +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @atomic_memcpy(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memcpy + ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memmove(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memmove + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memset(i8* nocapture %x) nounwind { + ; CHECK-LABEL: atomic_memset + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void 
@llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ret void +} + +;; ------------ + ; Check that we propagate shadow for "select" diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll new file mode 100644 index 000000000000..badf07db2c15 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on a non-x86_64 arch +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = "x86_32-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void @__sanitizer_cov_trace_cmp1(i8, i8) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16, i16) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32, i32) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll new file mode 100644 index 000000000000..16689f9831d8 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on x86_64 +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = 
"x86_64-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void @__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Linker/pr26037.ll b/test/Linker/pr26037.ll index 0e6da17e9fb7..da771669574f 100644 --- a/test/Linker/pr26037.ll +++ b/test/Linker/pr26037.ll @@ -44,13 +44,13 @@ entry: !7 = !{null} !8 = distinct !DISubprogram(name: "b", linkageName: "_ZN1A1bEv", scope: !5, file: !1, line: 8, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !{!10, !16} -!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, line: 8) +!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, file: !1, line: 8) !11 = !{i32 2, !"Dwarf Version", i32 4} !12 = !{i32 2, !"Debug Info Version", i32 3} !13 = !{!"clang version 3.8.0 (trunk 256934) (llvm/trunk 256936)"} !14 = !DILocation(line: 7, column: 12, scope: !4) !15 = !DILocation(line: 8, column: 24, scope: !8) -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, 
file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !5, file: !1, line: 9, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/MC/AArch64/coff-relocations.s b/test/MC/AArch64/coff-relocations.s new file mode 100644 index 000000000000..221ecfd4cd41 --- /dev/null +++ b/test/MC/AArch64/coff-relocations.s @@ -0,0 +1,52 @@ +; RUN: llvm-mc -triple aarch64-windows -filetype obj -o - %s | \ +; RUN: llvm-readobj -r - | FileCheck %s + +; IMAGE_REL_ARM64_ADDR32 +.Linfo_foo: + .asciz "foo" + .long foo + +; IMAGE_REL_ARM64_ADDR32NB +.long func@IMGREL + +; IMAGE_REL_ARM64_ADDR64 +.globl struc +struc: + .quad arr + +; IMAGE_REL_ARM64_BRANCH26 +b target + +; IMAGE_REL_ARM64_PAGEBASE_REL21 +adrp x0, foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12A +add x0, x0, :lo12:foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12L +ldr x0, [x0, :lo12:foo] + +; IMAGE_REL_ARM64_SECREL +.secrel32 .Linfo_bar +.Linfo_bar: + +; IMAGE_REL_ARM64_SECTION +.secidx func + + +; CHECK: Format: COFF-ARM64 +; CHECK: Arch: aarch64 +; CHECK: AddressSize: 64bit +; CHECK: Relocations [ +; CHECK: Section (1) .text { +; CHECK: 0x4 IMAGE_REL_ARM64_ADDR32 foo +; CHECK: 0x8 IMAGE_REL_ARM64_ADDR32NB func +; CHECK: 0xC IMAGE_REL_ARM64_ADDR64 arr +; CHECK: 0x14 IMAGE_REL_ARM64_BRANCH26 target +; CHECK: 0x18 IMAGE_REL_ARM64_PAGEBASE_REL21 foo +; CHECK: 0x1C IMAGE_REL_ARM64_PAGEOFFSET_12A foo +; CHECK: 0x20 IMAGE_REL_ARM64_PAGEOFFSET_12L foo +; CHECK: 0x24 IMAGE_REL_ARM64_SECREL .text +; CHECK: 0x28 IMAGE_REL_ARM64_SECTION func +; CHECK: } +; CHECK: ] diff --git a/test/MC/AArch64/invalid-instructions-spellcheck.s b/test/MC/AArch64/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..8acb285ac9a6 --- /dev/null +++ b/test/MC/AArch64/invalid-instructions-spellcheck.s @@ -0,0 +1,37 @@ +// RUN: not llvm-mc 
-triple=aarch64 -show-encoding < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -mattr=-neon -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-NEON + +// This tests the mnemonic spell checker. + +// First check what happens when an instruction is omitted: + + w1, w2, w3 + +// CHECK: error: unknown token in expression +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ +// CHECK-NEXT: error: invalid operand +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ + +// We don't want to see a suggestion here; the edit distance is too large to +// give sensible suggestions: + + addddddddd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic +// CHECK-NEXT: addddddddd w1, w2, w3 +// CHECK-NEXT: ^ + + addd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic, did you mean: add, addp, adds, addv, fadd, madd? +// CHECK-NEXT: addd w1, w2, w3 +// CHECK-NEXT: ^ + +// Instructions 'addv' and 'addp' are only available when NEON is enabled, so we +// don't want to see them here: + +// CHECK-NO-NEON: error: unrecognized instruction mnemonic, did you mean: add, adds, fadd, madd? 
+// CHECK-NO-NEON-NEXT: addd w1, w2, w3 +// CHECK-NO-NEON-NEXT: ^ diff --git a/test/MC/AMDGPU/gfx9_asm_all.s b/test/MC/AMDGPU/gfx9_asm_all.s index 56484a37bdce..40bc2f8e159a 100644 --- a/test/MC/AMDGPU/gfx9_asm_all.s +++ b/test/MC/AMDGPU/gfx9_asm_all.s @@ -105392,3 +105392,354 @@ v_mad_mixlo_f16 v5, |v1|, |v2|, |v3| v_mad_mixlo_f16 v5, v1, v2, v3 clamp // CHECK: [0x05,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_i16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_lo, v3 +// CHECK: 
[0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_i16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_i16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_i16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, 
v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_u16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, 
vcc_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_u16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_u16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_u16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: 
[0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_sub_u16 v5, v1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v255, v1, v2 +// CHECK: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v255, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] + +v_pk_sub_u16 v5, s1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] + +v_pk_sub_u16 v5, s101, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] + +v_pk_sub_u16 v5, m0, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] + +v_pk_sub_u16 v5, v1, v255 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] + +v_pk_sub_u16 v5, v1, s2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] + +v_pk_sub_u16 v5, v1, s101 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] + +v_pk_sub_u16 v5, v1, m0 +// CHECK: 
[0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] +// CHECK: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] +// CHECK: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] +// CHECK: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] + +v_pk_sub_u16 v5, v1, v2 clamp +// CHECK: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] diff --git a/test/MC/AMDGPU/vop3-errs.s b/test/MC/AMDGPU/vop3-errs.s index 7ba577049af3..855dd0b5de08 100644 --- a/test/MC/AMDGPU/vop3-errs.s +++ b/test/MC/AMDGPU/vop3-errs.s @@ -1,35 +1,47 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN v_add_f32_e64 v0, v1 -// CHECK: error: too few operands for instruction +// GCN: error: too few operands for 
instruction v_div_scale_f32 v24, vcc, v22, 1.1, v22 -// CHECK: error: invalid operand for instruction +// GCN: error: invalid operand for instruction v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] -// CHECK: error: instruction not supported on this GPU +// GFX67: error: instruction not supported on this GPU +// GFX89: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[0:1], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[1:2], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[2:3], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[3:4], v[0:1], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[4:5], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[5:6], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[8:9], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[9:10], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources + +v_cmp_eq_f32_e64 vcc, v0, v1 mul:2 +// GCN: error: invalid operand for instruction + +v_cmp_le_f64_e64 vcc, v0, v1 mul:4 +// GCN: error: invalid operand for instruction + +v_cvt_u32_f32_e64 v0, v1 div:2 +// GCN: error: invalid operand for 
instruction \ No newline at end of file diff --git a/test/MC/ARM/virtexts-thumb.s b/test/MC/ARM/virtexts-thumb.s index d911e1dfb1d7..0ea0bafe5a86 100644 --- a/test/MC/ARM/virtexts-thumb.s +++ b/test/MC/ARM/virtexts-thumb.s @@ -50,7 +50,7 @@ # CHECK-THUMB: [0xde,0xf3,0x00,0x8f] # SUBS PC, LR, #0 should have the same encoding as ERET. -# The conditional forms can't be tested becuse the ARM assembler parser doesn't +# The conditional forms can't be tested because the ARM assembler parser doesn't # accept SUBS PC, LR, #, only the unconditonal form is allowed. This # is due to the way that the custom parser handles optional operands; see the # FIXME in ARM/AsmParser/ARMAsmParser.cpp. diff --git a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt index 63374300504c..188534706eb5 100644 --- a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt +++ b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt @@ -88934,3 +88934,408 @@ # CHECK: v_pk_sub_i16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18] 0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 ; encoding: 
[0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] 
+0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_i16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] 
+0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, flat_scratch_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_lo, v2, v3 ; encoding: 
[0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] 
+0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_u16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] 
+0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_sub_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v255, v1, v2 ; encoding: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v255, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s101, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] 
+0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, m0, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v255 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s101 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, m0 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] ; 
encoding: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10 + +# CHECK: v_pk_sub_u16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[2:3], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[4:5], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[100:101], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], flat_scratch, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], vcc, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], exec, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04] 
+0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -1, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0.5, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -4.0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s101, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], m0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0, v[3:6] ; encoding: 
[0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -1, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0.5, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -4.0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04 diff --git a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt index 62e7092086aa..7025354d6847 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt @@ -10,23 +10,4 @@ 0x08 0x10 0x65 0x7c # CHECK: fork $2, $3, $5 0x09 0x00 0x80 0x7c # CHECK: yield $4 0x09 0x20 0xa0 0x7c # CHECK: yield $4, $5 -0x02 0x20 0x05 0x41 # CHECK: mftr $4, $5, 0, 2, 0 -0x20 0x20 0x05 0x41 # CHECK: mftr $4, $5, 1, 0, 0 -0x21 0x20 0x00 0x41 # CHECK: mftr $4, $zero, 1, 1, 0 -0x21 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 1, 0 -0x22 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 0 -0x32 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 1 -0x23 0x20 0x1a 0x41 # CHECK: mftr $4, $26, 1, 3, 0 -0x23 0x20 0x1f 0x41 # CHECK: mftr $4, $ra, 1, 3, 0 -0x24 0x20 0x0e 0x41 # CHECK: mftr $4, $14, 1, 4, 0 -0x25 0x20 0x0f 0x41 # CHECK: mftr $4, $15, 1, 5, 0 -0x02 0x28 0x84 0x41 # CHECK: mttr $4, $5, 0, 2, 0 -0x20 0x28 0x84 0x41 # CHECK: mttr $4, $5, 1, 0, 0 -0x21 0x00 0x84 0x41 # CHECK: mttr $4, $zero, 1, 1, 0 -0x21 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 1, 0 -0x22 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 0 -0x32 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 1 -0x23 0xd0 0x84 0x41 # CHECK: mttr $4, $26, 1, 3, 0 -0x23 0xf8 0x84 0x41 # CHECK: mttr $4, $ra, 1, 3, 0 -0x24 0x70 0x84 0x41 # CHECK: mttr $4, $14, 1, 4, 0 -0x25 0x78 0x84 0x41 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git 
a/test/MC/Disassembler/Mips/mt/valid-r2.txt b/test/MC/Disassembler/Mips/mt/valid-r2.txt index 4786d8b5591f..17c42c0614a5 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2.txt @@ -10,23 +10,4 @@ 0x7c 0x65 0x10 0x08 # CHECK: fork $2, $3, $5 0x7c 0x80 0x00 0x09 # CHECK: yield $4 0x7c 0xa0 0x20 0x09 # CHECK: yield $4, $5 -0x41 0x05 0x20 0x02 # CHECK: mftr $4, $5, 0, 2, 0 -0x41 0x05 0x20 0x20 # CHECK: mftr $4, $5, 1, 0, 0 -0x41 0x00 0x20 0x21 # CHECK: mftr $4, $zero, 1, 1, 0 -0x41 0x0a 0x20 0x21 # CHECK: mftr $4, $10, 1, 1, 0 -0x41 0x0a 0x20 0x22 # CHECK: mftr $4, $10, 1, 2, 0 -0x41 0x0a 0x20 0x32 # CHECK: mftr $4, $10, 1, 2, 1 -0x41 0x1a 0x20 0x23 # CHECK: mftr $4, $26, 1, 3, 0 -0x41 0x1f 0x20 0x23 # CHECK: mftr $4, $ra, 1, 3, 0 -0x41 0x0e 0x20 0x24 # CHECK: mftr $4, $14, 1, 4, 0 -0x41 0x0f 0x20 0x25 # CHECK: mftr $4, $15, 1, 5, 0 -0x41 0x84 0x28 0x02 # CHECK: mttr $4, $5, 0, 2, 0 -0x41 0x84 0x28 0x20 # CHECK: mttr $4, $5, 1, 0, 0 -0x41 0x84 0x00 0x21 # CHECK: mttr $4, $zero, 1, 1, 0 -0x41 0x84 0x50 0x21 # CHECK: mttr $4, $10, 1, 1, 0 -0x41 0x84 0x50 0x22 # CHECK: mttr $4, $10, 1, 2, 0 -0x41 0x84 0x50 0x32 # CHECK: mttr $4, $10, 1, 2, 1 -0x41 0x84 0xd0 0x23 # CHECK: mttr $4, $26, 1, 3, 0 -0x41 0x84 0xf8 0x23 # CHECK: mttr $4, $ra, 1, 3, 0 -0x41 0x84 0x70 0x24 # CHECK: mttr $4, $14, 1, 4, 0 -0x41 0x84 0x78 0x25 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git a/test/MC/Disassembler/SystemZ/insns-z14.txt b/test/MC/Disassembler/SystemZ/insns-z14.txt new file mode 100644 index 000000000000..c73b50c1c2fb --- /dev/null +++ b/test/MC/Disassembler/SystemZ/insns-z14.txt @@ -0,0 +1,3253 @@ +# Test z14 instructions that don't have PC-relative operands. 
+# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z14 \ +# RUN: | FileCheck %s + +# CHECK: agh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x38 + +# CHECK: agh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x38 + +# CHECK: agh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x38 + +# CHECK: agh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x38 + +# CHECK: agh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x38 + +# CHECK: agh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x38 + +# CHECK: agh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x38 + +# CHECK: agh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x38 + +# CHECK: agh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x38 + +# CHECK: agh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x38 + +# CHECK: bi -524288 +0xe3 0xf0 0x00 0x00 0x80 0x47 + +# CHECK: bi -1 +0xe3 0xf0 0x0f 0xff 0xff 0x47 + +# CHECK: bi 0 +0xe3 0xf0 0x00 0x00 0x00 0x47 + +# CHECK: bi 1 +0xe3 0xf0 0x00 0x01 0x00 0x47 + +# CHECK: bi 524287 +0xe3 0xf0 0x0f 0xff 0x7f 0x47 + +# CHECK: bi 0(%r1) +0xe3 0xf0 0x10 0x00 0x00 0x47 + +# CHECK: bi 0(%r15) +0xe3 0xf0 0xf0 0x00 0x00 0x47 + +# CHECK: bi 524287(%r1,%r15) +0xe3 0xf1 0xff 0xff 0x7f 0x47 + +# CHECK: bi 524287(%r15,%r1) +0xe3 0xff 0x1f 0xff 0x7f 0x47 + +# CHECK: bic 0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x47 + +# CHECK: bic 0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x47 + +# CHECK: bic 0, 0 +0xe3 0x00 0x00 0x00 0x00 0x47 + +# CHECK: bic 0, 1 +0xe3 0x00 0x00 0x01 0x00 0x47 + +# CHECK: bic 0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x47 + +# CHECK: bic 0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x47 + +# CHECK: bic 0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x47 + +# CHECK: bic 0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x47 + +# CHECK: bic 0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x47 + +# CHECK: bio 0(%r15) +0xe3 0x10 0xf0 0x00 0x00 0x47 + +# CHECK: bih 0(%r15) +0xe3 0x20 0xf0 0x00 0x00 0x47 + +# CHECK: binle 0(%r15) +0xe3 0x30 0xf0 0x00 0x00 0x47 + +# CHECK: bil 0(%r15) +0xe3 0x40 0xf0 0x00 0x00 0x47 + +# CHECK: binhe 0(%r15) +0xe3 0x50 0xf0 0x00 0x00 0x47 + +# CHECK: bilh 0(%r15) 
+0xe3 0x60 0xf0 0x00 0x00 0x47 + +# CHECK: bine 0(%r15) +0xe3 0x70 0xf0 0x00 0x00 0x47 + +# CHECK: bie 0(%r15) +0xe3 0x80 0xf0 0x00 0x00 0x47 + +# CHECK: binlh 0(%r15) +0xe3 0x90 0xf0 0x00 0x00 0x47 + +# CHECK: bihe 0(%r15) +0xe3 0xa0 0xf0 0x00 0x00 0x47 + +# CHECK: binl 0(%r15) +0xe3 0xb0 0xf0 0x00 0x00 0x47 + +# CHECK: bile 0(%r15) +0xe3 0xc0 0xf0 0x00 0x00 0x47 + +# CHECK: binh 0(%r15) +0xe3 0xd0 0xf0 0x00 0x00 0x47 + +# CHECK: bino 0(%r15) +0xe3 0xe0 0xf0 0x00 0x00 0x47 + +# CHECK: irbm %r0, %r0 +0xb9 0xac 0x00 0x00 + +# CHECK: irbm %r0, %r15 +0xb9 0xac 0x00 0x0f + +# CHECK: irbm %r15, %r0 +0xb9 0xac 0x00 0xf0 + +# CHECK: irbm %r7, %r8 +0xb9 0xac 0x00 0x78 + +# CHECK: irbm %r15, %r15 +0xb9 0xac 0x00 0xff + +# CHECK: kma %r2, %r2, %r2 +0xb9 0x29 0x20 0x22 + +# CHECK: kma %r2, %r8, %r14 +0xb9 0x29 0x80 0x2e + +# CHECK: kma %r14, %r8, %r2 +0xb9 0x29 0x80 0xe2 + +# CHECK: kma %r6, %r8, %r10 +0xb9 0x29 0x80 0x6a + +# CHECK: lgg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4c + +# CHECK: lgg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4c + +# CHECK: lgg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4c + +# CHECK: lgg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4c + +# CHECK: lgg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4c + +# CHECK: lgg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4c + +# CHECK: lgg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4c + +# CHECK: lgg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x4c + +# CHECK: lgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4d + +# CHECK: lgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4d + +# CHECK: lgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4d + +# CHECK: lgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4d + +# CHECK: lgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4d + 
+# CHECK: lgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4d + +# CHECK: llgfsg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x48 + +# CHECK: llgfsg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x48 + +# CHECK: llgfsg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x48 + +# CHECK: llgfsg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x48 + +# CHECK: mg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x84 + +# CHECK: mg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x84 + +# CHECK: mg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x84 + +# CHECK: mg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x84 + +# CHECK: mg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x84 + +# CHECK: mg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x84 + +# CHECK: mg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x84 + +# CHECK: mg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x84 + +# CHECK: mg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x84 + +# CHECK: mg %r14, 0 +0xe3 0xe0 0x00 0x00 0x00 0x84 + +# CHECK: mgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x3c + +# CHECK: mgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x3c + +# CHECK: mgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x3c + +# CHECK: mgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x3c + +# CHECK: mgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x3c + +# CHECK: mgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: mgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x3c + +# CHECK: mgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x3c + +# CHECK: mgrk %r0, %r0, %r0 +0xb9 0xec 0x00 0x00 + +# CHECK: mgrk %r0, %r0, %r15 +0xb9 0xec 0xf0 0x00 + +# CHECK: mgrk %r0, 
%r15, %r0 +0xb9 0xec 0x00 0x0f + +# CHECK: mgrk %r14, %r0, %r0 +0xb9 0xec 0x00 0xe0 + +# CHECK: mgrk %r6, %r8, %r9 +0xb9 0xec 0x90 0x68 + +# CHECK: msc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x53 + +# CHECK: msc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x53 + +# CHECK: msc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x53 + +# CHECK: msc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x53 + +# CHECK: msc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x53 + +# CHECK: msc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x53 + +# CHECK: msc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x53 + +# CHECK: msc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x53 + +# CHECK: msc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x53 + +# CHECK: msc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x53 + +# CHECK: msgc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x83 + +# CHECK: msgc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x83 + +# CHECK: msgc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x83 + +# CHECK: msgc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x83 + +# CHECK: msgc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x83 + +# CHECK: msgc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x83 + +# CHECK: msgc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x83 + +# CHECK: msgc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x83 + +# CHECK: msrkc %r0, %r0, %r0 +0xb9 0xfd 0x00 0x00 + +# CHECK: msrkc %r0, %r0, %r15 +0xb9 0xfd 0xf0 0x00 + +# CHECK: msrkc %r0, %r15, %r0 +0xb9 0xfd 0x00 0x0f + +# CHECK: msrkc %r15, %r0, %r0 +0xb9 0xfd 0x00 0xf0 + +# CHECK: msrkc %r7, %r8, %r9 +0xb9 0xfd 0x90 0x78 + +# CHECK: msgrkc %r0, %r0, %r0 +0xb9 0xed 0x00 0x00 + +# CHECK: msgrkc %r0, %r0, %r15 +0xb9 0xed 0xf0 0x00 + +# CHECK: msgrkc %r0, %r15, %r0 +0xb9 0xed 0x00 0x0f + +# CHECK: msgrkc %r15, %r0, %r0 +0xb9 0xed 0x00 0xf0 + +# CHECK: msgrkc %r7, %r8, %r9 +0xb9 0xed 0x90 0x78 + +# CHECK: sgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x39 + +# CHECK: sgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x39 + +# 
CHECK: sgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x39 + +# CHECK: sgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x39 + +# CHECK: sgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x39 + +# CHECK: sgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x39 + +# CHECK: sgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x39 + +# CHECK: sgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x39 + +# CHECK: stgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x49 + +# CHECK: stgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x49 + +# CHECK: stgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x49 + +# CHECK: stgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x49 + +# CHECK: stgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x49 + +# CHECK: stgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x49 + +# CHECK: vap %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x71 + +# CHECK: vap %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x71 + +# CHECK: vap %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x71 + +# CHECK: vap %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x71 + +# CHECK: vap %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x71 + +# CHECK: vbperm %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v15 +0xe7 0x00 0xf0 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x85 + +# CHECK: vbperm %v0, %v15, %v0 +0xe7 0x0f 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x85 + +# CHECK: vbperm %v15, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x85 + +# CHECK: vbperm %v18, %v3, 
%v20 +0xe7 0x23 0x40 0x00 0x0a 0x85 + +# CHECK: vcp %v0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x77 + +# CHECK: vcp %v15, %v0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v31, %v0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x77 + +# CHECK: vcp %v0, %v15, 0 +0xe6 0x00 0xf0 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v31, 0 +0xe6 0x00 0xf0 0x00 0x02 0x77 + +# CHECK: vcp %v3, %v18, 4 +0xe6 0x03 0x20 0x40 0x02 0x77 + +# CHECK: vcvb %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x50 + +# CHECK: vcvb %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vcvb %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x50 + +# CHECK: vcvbg %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x52 + +# CHECK: vcvbg %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x52 + +# CHECK: vcvbg %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x52 + +# CHECK: vcvd %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x58 + +# CHECK: vcvd %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v31, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x58 + +# CHECK: vcvd %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x58 + +# CHECK: vcvdg %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5a + +# CHECK: vcvdg %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v31, %r0, 0, 0 
+0xe6 0xf0 0x00 0x00 0x08 0x5a + +# CHECK: vcvdg %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x5a + +# CHECK: vdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7a + +# CHECK: vdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7a + +# CHECK: vdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7a + +# CHECK: vdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7a + +# CHECK: vdp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7a + +# CHECK: vfasb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe3 + +# CHECK: vfasb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe3 + +# CHECK: vfasb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe3 + +# CHECK: vfasb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe3 + +# CHECK: vfasb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe3 + +# CHECK: vfcesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe8 + +# CHECK: vfcesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe8 + +# CHECK: vfcesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe8 + +# CHECK: vfcesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe8 + +# CHECK: vfcesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xe8 + +# CHECK: vfcesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xe8 + +# CHECK: vfcesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xe8 + +# CHECK: vfcesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xe8 + +# CHECK: vfchsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xeb + +# CHECK: vfchsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xeb + +# CHECK: vfchsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xeb + +# CHECK: vfchsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xeb + +# CHECK: vfchsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xeb + +# CHECK: vfchsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xeb + +# CHECK: vfchsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 
0x22 0xeb + +# CHECK: vfchsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xeb + +# CHECK: vfchsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xeb + +# CHECK: vfchsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xeb + +# CHECK: vfchesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xea + +# CHECK: vfchesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xea + +# CHECK: vfchesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xea + +# CHECK: vfchesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xea + +# CHECK: vfchesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xea + +# CHECK: vfchesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xea + +# CHECK: vfchesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xea + +# CHECK: vfchesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xea + +# CHECK: vfchesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xea + +# CHECK: vfchesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xea + +# CHECK: vfdsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe5 + +# CHECK: vfdsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe5 + +# CHECK: vfdsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe5 + +# CHECK: vfdsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe5 + +# CHECK: vfdsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe5 + +# CHECK: vfisb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x04 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x07 0x20 0xc7 + +# CHECK: vfisb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xc7 + +# CHECK: vfisb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xc7 + +# CHECK: vfisb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xa4 0x24 0xc7 + +# CHECK: vfkedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xe8 + +# CHECK: vfkedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xe8 + +# CHECK: vfkedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xe8 + +# CHECK: vfkedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xe8 + +# CHECK: vfkedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xe8 + +# CHECK: vfkedbs %v0, 
%v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xe8 + +# CHECK: vfkedbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xe8 + +# CHECK: vfkedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xe8 + +# CHECK: vfkedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xe8 + +# CHECK: vfkedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xe8 + +# CHECK: vfkesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xe8 + +# CHECK: vfkesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xe8 + +# CHECK: vfkesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xe8 + +# CHECK: vfkesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xe8 + +# CHECK: vfkesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xe8 + +# CHECK: vfkesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xe8 + +# CHECK: vfkesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xe8 + +# CHECK: vfkesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xe8 + +# CHECK: vfkhdb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xeb + +# CHECK: vfkhdb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xeb + +# CHECK: vfkhdb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xeb + +# CHECK: vfkhdb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xeb + +# CHECK: vfkhdb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xeb + +# CHECK: vfkhdbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xeb + +# CHECK: vfkhdbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xeb + +# CHECK: vfkhdbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xeb + +# CHECK: vfkhsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xeb + +# CHECK: vfkhsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xeb + +# CHECK: vfkhsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xeb + +# CHECK: vfkhsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xeb + +# CHECK: vfkhsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xeb + +# CHECK: vfkhsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 
0xeb + +# CHECK: vfkhsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xeb + +# CHECK: vfkhsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xeb + +# CHECK: vfkhsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xeb + +# CHECK: vfkhsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xeb + +# CHECK: vfkhedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xea + +# CHECK: vfkhedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xea + +# CHECK: vfkhedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xea + +# CHECK: vfkhedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xea + +# CHECK: vfkhedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xea + +# CHECK: vfkhedbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xea + +# CHECK: vfkhedbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xea + +# CHECK: vfkhedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xea + +# CHECK: vfkhedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xea + +# CHECK: vfkhedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xea + +# CHECK: vfkhesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xea + +# CHECK: vfkhesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xea + +# CHECK: vfkhesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xea + +# CHECK: vfkhesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xea + +# CHECK: vfkhesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xea + +# CHECK: vfkhesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xea + +# CHECK: vfkhesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xea + +# CHECK: vfkhesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xea + +# CHECK: vfkhesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xea + +# CHECK: vfkhesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xea + +# CHECK: vfpsosb %v0, %v0, 3 +0xe7 0x00 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xcc + +# CHECK: vfpsosb %v0, %v15, 3 +0xe7 0x0f 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v31, 3 +0xe7 0x0f 0x00 0x30 0x24 0xcc + +# CHECK: vfpsosb %v15, %v0, 3 +0xe7 0xf0 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v31, %v0, 3 +0xe7 0xf0 0x00 0x30 0x28 0xcc + +# CHECK: 
vfpsosb %v14, %v17, 7 +0xe7 0xe1 0x00 0x70 0x24 0xcc + +# CHECK: vflcsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcc + +# CHECK: vflcsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xcc + +# CHECK: vflcsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcc + +# CHECK: vflnsb %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v15 +0xe7 0x0f 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v31 +0xe7 0x0f 0x00 0x10 0x24 0xcc + +# CHECK: vflnsb %v15, %v0 +0xe7 0xf0 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v31, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xcc + +# CHECK: vflnsb %v14, %v17 +0xe7 0xe1 0x00 0x10 0x24 0xcc + +# CHECK: vflpsb %v0, %v0 +0xe7 0x00 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v15 +0xe7 0x0f 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v31 +0xe7 0x0f 0x00 0x20 0x24 0xcc + +# CHECK: vflpsb %v15, %v0 +0xe7 0xf0 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v31, %v0 +0xe7 0xf0 0x00 0x20 0x28 0xcc + +# CHECK: vflpsb %v14, %v17 +0xe7 0xe1 0x00 0x20 0x24 0xcc + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xef + +# CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xef + +# CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xef + +# CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xef + +# CHECK: vfmaxdb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xef + +# 
CHECK: vfmaxdb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xef + +# CHECK: vfmaxdb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xef + +# CHECK: vfmaxsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xef + +# CHECK: vfmaxsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xef + +# CHECK: vfmaxsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xef + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xee + +# CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xee + +# CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xee + +# CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xee + +# CHECK: vfmindb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xee + +# CHECK: vfmindb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xee + +# CHECK: vfmindb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xee + +# CHECK: vfminsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xee + +# CHECK: vfminsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xee + +# CHECK: vfminsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xee + +# CHECK: vfmasb %v0, %v0, %v0, %v0 
+0xe7 0x00 0x02 0x00 0x00 0x8f + +# CHECK: vfmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8f + +# CHECK: vfmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8f + +# CHECK: vfmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8f + +# CHECK: vfmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8f + +# CHECK: vfmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8f + +# CHECK: vfmsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe7 + +# CHECK: vfmsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe7 + +# CHECK: vfmsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe7 + +# CHECK: vfmsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe7 + +# CHECK: vfmsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe7 + +# CHECK: vfmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x8e + +# CHECK: vfmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8e + +# CHECK: vfmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8e + +# CHECK: vfmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8e + +# CHECK: vfmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8e + +# CHECK: vfmssb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8e + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9f + +# CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9f + +# CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9f + +# CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9f + +# CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9f + +# CHECK: vfnmadb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9f + +# CHECK: vfnmadb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 
0x00 0x04 0x9f + +# CHECK: vfnmadb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9f + +# CHECK: vfnmadb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9f + +# CHECK: vfnmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9f + +# CHECK: vfnmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9f + +# CHECK: vfnmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9f + +# CHECK: vfnmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9f + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9e + +# CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9e + +# CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9e + +# CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9e + +# CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9e + +# CHECK: vfnmsdb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 0x00 0x04 0x9e + +# CHECK: vfnmsdb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9e + +# CHECK: vfnmsdb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9e + +# CHECK: vfnmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9e + +# CHECK: vfnmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9e + +# CHECK: vfnmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9e + +# CHECK: vfnmssb %v13, 
%v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9e + +# CHECK: vfssb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe2 + +# CHECK: vfssb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe2 + +# CHECK: vfssb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe2 + +# CHECK: vfssb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe2 + +# CHECK: vfssb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe2 + +# CHECK: vfsqsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xce + +# CHECK: vfsqsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xce + +# CHECK: vfsqsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xce + +# CHECK: vftcisb %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf0 0x20 0x4a + +# CHECK: vftcisb %v0, %v15, 0 +0xe7 0x0f 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v31, 0 +0xe7 0x0f 0x00 0x00 0x24 0x4a + +# CHECK: vftcisb %v15, %v0, 0 +0xe7 0xf0 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v31, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0x4a + +# CHECK: vftcisb %v4, %v21, 1656 +0xe7 0x45 0x67 0x80 0x24 0x4a + +# CHECK: vlip %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v0, 0, 15 +0xe6 0x00 0x00 0x00 0xf0 0x49 + +# CHECK: vlip %v0, 65535, 0 +0xe6 0x00 0xff 0xff 0x00 0x49 + +# CHECK: vlip %v15, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v31, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x49 + +# CHECK: vlip %v17, 4660, 7 +0xe6 0x10 0x12 0x34 0x78 0x49 + +# CHECK: vllezlf %v0, 0 +0xe7 0x00 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 4095 +0xe7 0x00 0x0f 0xff 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15) +0xe7 0x00 0xf0 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15,%r1) +0xe7 0x0f 0x10 0x00 0x60 0x04 + +# CHECK: vllezlf %v15, 0 +0xe7 0xf0 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v31, 0 +0xe7 0xf0 0x00 0x00 0x68 0x04 + +# CHECK: vllezlf %v18, 1383(%r3,%r4) +0xe7 0x23 0x45 0x67 0x68 0x04 + +# CHECK: vlrl 
%v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x35 + +# CHECK: vlrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x35 + +# CHECK: vlrl %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x35 + +# CHECK: vlrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x35 + +# CHECK: vlrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x37 + +# CHECK: vlrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x37 + +# CHECK: vlrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x37 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x0f 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 12 +0xe7 0x00 0x00 0xc0 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0xb8 + +# CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xb8 + +# CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xb8 + +# CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 +0xe7 0x23 0x40 0x40 0x5a 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 +0xe7 0x23 0x4b 0x80 0x5a 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 0 +0xe7 0x00 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 12 +0xe7 0x00 0x03 0xc0 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, 
%v15, 0 +0xe7 0x00 0x03 0x00 0xf0 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v31, 0 +0xe7 0x00 0x03 0x00 0xf1 0xb8 + +# CHECK: vmslg %v0, %v0, %v15, %v0, 0 +0xe7 0x00 0xf3 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v31, %v0, 0 +0xe7 0x00 0xf3 0x00 0x02 0xb8 + +# CHECK: vmslg %v0, %v15, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v31, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x04 0xb8 + +# CHECK: vmslg %v15, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v31, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x08 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 4 +0xe7 0x23 0x43 0x40 0x5a 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 8 +0xe7 0x23 0x43 0x80 0x5a 0xb8 + +# CHECK: vmp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x78 + +# CHECK: vmp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x78 + +# CHECK: vmp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x78 + +# CHECK: vmp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x78 + +# CHECK: vmp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x78 + +# CHECK: vmsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x79 + +# CHECK: vmsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x79 + +# CHECK: vmsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x79 + +# CHECK: vmsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x79 + +# CHECK: vmsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x79 + +# CHECK: vnn %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6e + +# CHECK: vnn %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6e + +# CHECK: vnn %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6e + +# CHECK: vnn %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6e + +# CHECK: vnn %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6e + +# CHECK: vnx %v0, %v0, %v0 
+0xe7 0x00 0x00 0x00 0x00 0x6c + +# CHECK: vnx %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6c + +# CHECK: vnx %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6c + +# CHECK: vnx %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6c + +# CHECK: vnx %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6c + +# CHECK: voc %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6f + +# CHECK: voc %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6f + +# CHECK: voc %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6f + +# CHECK: voc %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6f + +# CHECK: voc %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6f + +# CHECK: vpkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x34 + +# CHECK: vpkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x34 + +# CHECK: vpkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x34 + +# CHECK: vpkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x34 + +# CHECK: vpopctb %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vpopctb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x50 + +# CHECK: vpopctb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x04 0x50 + +# CHECK: vpopctf %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0x50 + +# CHECK: vpopctf %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0x50 + +# CHECK: vpopctf %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0x50 + +# CHECK: vpopctg %v0, %v0 +0xe7 0x00 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v15 +0xe7 0x0f 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v31 +0xe7 0x0f 0x00 0x00 0x34 0x50 + +# CHECK: vpopctg %v15, %v0 +0xe7 0xf0 0x00 0x00 0x30 0x50 + +# 
CHECK: vpopctg %v31, %v0 +0xe7 0xf0 0x00 0x00 0x38 0x50 + +# CHECK: vpopctg %v14, %v17 +0xe7 0xe1 0x00 0x00 0x34 0x50 + +# CHECK: vpopcth %v0, %v0 +0xe7 0x00 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v15 +0xe7 0x0f 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v31 +0xe7 0x0f 0x00 0x00 0x14 0x50 + +# CHECK: vpopcth %v15, %v0 +0xe7 0xf0 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v31, %v0 +0xe7 0xf0 0x00 0x00 0x18 0x50 + +# CHECK: vpopcth %v14, %v17 +0xe7 0xe1 0x00 0x00 0x14 0x50 + +# CHECK: vpsop %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5b + +# CHECK: vpsop %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x5b + +# CHECK: vpsop %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x5b + +# CHECK: vpsop %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x5b + +# CHECK: vrp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7b + +# CHECK: vrp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7b + +# CHECK: vrp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7b + +# CHECK: vrp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7b + +# CHECK: vrp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7b + +# CHECK: vsdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7e + +# CHECK: vsdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7e + +# CHECK: vsdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7e + +# CHECK: vsdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7e + +# CHECK: vsdp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7e + +# CHECK: vsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 
0x73 + +# CHECK: vsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x73 + +# CHECK: vsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x73 + +# CHECK: vsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x73 + +# CHECK: vsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x73 + +# CHECK: vsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x73 + +# CHECK: vsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x73 + +# CHECK: vsrp %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x59 + +# CHECK: vsrp %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x59 + +# CHECK: vsrp %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x59 + +# CHECK: vsrp %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x59 + +# CHECK: vstrl %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3d + +# CHECK: vstrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3d + +# CHECK: vstrl %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3d + +# CHECK: vstrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3d + +# CHECK: vstrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3f + +# CHECK: vstrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3f + +# CHECK: vstrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x3f + +# CHECK: vtp %v0 +0xe6 0x00 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v15 +0xe6 0x0f 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v31 +0xe6 0x0f 0x00 0x00 0x04 0x5f + +# CHECK: vupkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3c + 
+# CHECK: vupkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3c + +# CHECK: vupkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: vupkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3c + +# CHECK: vupkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3c + +# CHECK: vupkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3c + +# CHECK: vupkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3c + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe3 + +# CHECK: wfasb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe3 + +# CHECK: wfasb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe3 + +# CHECK: wfasb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe3 + +# CHECK: wfaxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe3 + +# CHECK: wfaxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe3 + +# CHECK: wfaxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe3 + +# CHECK: wfaxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe3 + +# CHECK: wfaxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe3 + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcb + +# CHECK: wfcsb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xcb + +# CHECK: wfcsb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcb + +# CHECK: wfcxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xcb + +# CHECK: wfcxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xcb + +# CHECK: wfcxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xcb + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, 
%v31 +0xe7 0x00 0xf0 0x08 0x22 0xe8 + +# CHECK: wfcesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe8 + +# CHECK: wfcesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe8 + +# CHECK: wfcesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xe8 + +# CHECK: wfcesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xe8 + +# CHECK: wfcesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xe8 + +# CHECK: wfcesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xe8 + +# CHECK: wfcexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe8 + +# CHECK: wfcexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe8 + +# CHECK: wfcexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe8 + +# CHECK: wfcexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe8 + +# CHECK: wfcexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xe8 + +# CHECK: wfcexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xe8 + +# CHECK: wfcexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xe8 + +# CHECK: wfcexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xe8 + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xeb + +# CHECK: wfchsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xeb + +# CHECK: wfchsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xeb + +# CHECK: wfchsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xeb + +# CHECK: wfchsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xeb + +# CHECK: wfchsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xeb 
+ +# CHECK: wfchsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xeb + +# CHECK: wfchxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xeb + +# CHECK: wfchxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xeb + +# CHECK: wfchxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xeb + +# CHECK: wfchxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xeb + +# CHECK: wfchxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xeb + +# CHECK: wfchxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xeb + +# CHECK: wfchxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xeb + +# CHECK: wfchxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xeb + +# CHECK: wfchxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xeb + +# CHECK: wfchxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xeb + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xea + +# CHECK: wfchesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xea + +# CHECK: wfchesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xea + +# CHECK: wfchesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xea + +# CHECK: wfchesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xea + +# CHECK: wfchesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xea + +# CHECK: wfchesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xea + +# CHECK: wfchexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xea + +# CHECK: wfchexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xea + +# CHECK: wfchexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xea + +# CHECK: wfchexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xea + +# CHECK: wfchexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xea + +# CHECK: wfchexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xea + +# CHECK: wfchexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xea + +# CHECK: 
wfchexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xea + +# CHECK: wfchexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xea + +# CHECK: wfchexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xea + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe5 + +# CHECK: wfdsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe5 + +# CHECK: wfdsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe5 + +# CHECK: wfdsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe5 + +# CHECK: wfdxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe5 + +# CHECK: wfdxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe5 + +# CHECK: wfdxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe5 + +# CHECK: wfdxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe5 + +# CHECK: wfdxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe5 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 4, 0 +0xe7 0x00 0x00 0x0c 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 7, 0 +0xe7 0x00 0x00 0x0f 0x20 0xc7 + +# CHECK: wfisb %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xc7 + +# CHECK: wfisb %v31, %f0, 0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xc7 + +# CHECK: wfisb %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x24 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc7 + +# CHECK: wfixb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc7 + +# CHECK: wfixb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc7 + +# CHECK: wfixb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc7 + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb 
%f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xca + +# CHECK: wfksb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xca + +# CHECK: wfksb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xca + +# CHECK: wfksb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xca + +# CHECK: wfkxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xca + +# CHECK: wfkxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xca + +# CHECK: wfkxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xca + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xe8 + +# CHECK: wfkedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xe8 + +# CHECK: wfkedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xe8 + +# CHECK: wfkedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xe8 + +# CHECK: wfkedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xe8 + +# CHECK: wfkedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xe8 + +# CHECK: wfkedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xe8 + +# CHECK: wfkesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xe8 + +# CHECK: wfkesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xe8 + +# CHECK: wfkesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 
0xe8 + +# CHECK: wfkesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xe8 + +# CHECK: wfkesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xe8 + +# CHECK: wfkesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xe8 + +# CHECK: wfkexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xe8 + +# CHECK: wfkexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xe8 + +# CHECK: wfkexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xe8 + +# CHECK: wfkexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xe8 + +# CHECK: wfkexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xe8 + +# CHECK: wfkexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xe8 + +# CHECK: wfkexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xe8 + +# CHECK: wfkexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xe8 + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xeb + +# CHECK: wfkhdb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xeb + +# CHECK: wfkhdb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xeb + +# CHECK: wfkhdb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xeb + +# CHECK: wfkhdbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xeb + +# CHECK: wfkhdbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xeb + +# CHECK: wfkhdbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xeb + +# CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xeb + +# CHECK: wfkhsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xeb + +# CHECK: wfkhsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xeb + +# CHECK: wfkhsb %v18, %f3, 
%v20 +0xe7 0x23 0x40 0x0c 0x2a 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xeb + +# CHECK: wfkhsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xeb + +# CHECK: wfkhsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xeb + +# CHECK: wfkhsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xeb + +# CHECK: wfkhxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xeb + +# CHECK: wfkhxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xeb + +# CHECK: wfkhxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xeb + +# CHECK: wfkhxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xeb + +# CHECK: wfkhxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xeb + +# CHECK: wfkhxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xeb + +# CHECK: wfkhxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xeb + +# CHECK: wfkhxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xeb + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xea + +# CHECK: wfkhedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xea + +# CHECK: wfkhedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xea + +# CHECK: wfkhedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xea + +# CHECK: wfkhedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xea + +# CHECK: wfkhedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xea + +# CHECK: wfkhedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 
0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xea + +# CHECK: wfkhesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xea + +# CHECK: wfkhesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xea + +# CHECK: wfkhesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xea + +# CHECK: wfkhesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xea + +# CHECK: wfkhesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xea + +# CHECK: wfkhesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xea + +# CHECK: wfkhexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xea + +# CHECK: wfkhexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xea + +# CHECK: wfkhexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xea + +# CHECK: wfkhexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xea + +# CHECK: wfkhexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xea + +# CHECK: wfkhexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xea + +# CHECK: wfkhexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xea + +# CHECK: wfkhexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xea + +# CHECK: wfkhexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xea + +# CHECK: wfkhexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xea + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xcc + +# CHECK: wfpsosb %f0, %f15, 3 +0xe7 0x0f 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x24 0xcc + +# CHECK: wfpsosb %f15, %f0, 3 +0xe7 0xf0 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %v31, %f0, 3 +0xe7 0xf0 0x00 0x38 0x28 0xcc + +# CHECK: wfpsosb %f14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x24 0xcc + +# CHECK: wfpsoxb %v0, %v0, 3 +0xe7 0x00 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xcc + +# CHECK: 
wfpsoxb %v0, %v15, 3 +0xe7 0x0f 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x44 0xcc + +# CHECK: wfpsoxb %v15, %v0, 3 +0xe7 0xf0 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v31, %v0, 3 +0xe7 0xf0 0x00 0x38 0x48 0xcc + +# CHECK: wfpsoxb %v14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x44 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xcc + +# CHECK: wflcsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xcc + +# CHECK: wflcsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xcc + +# CHECK: wflcxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xcc + +# CHECK: wflcxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xcc + +# CHECK: wflcxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f15 +0xe7 0x0f 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %v31 +0xe7 0x0f 0x00 0x18 0x24 0xcc + +# CHECK: wflnsb %f15, %f0 +0xe7 0xf0 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %v31, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xcc + +# CHECK: wflnsb %f14, %v17 +0xe7 0xe1 0x00 0x18 0x24 0xcc + +# CHECK: wflnxb %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v15 +0xe7 0x0f 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v31 +0xe7 0x0f 0x00 0x18 0x44 0xcc + +# CHECK: wflnxb %v15, %v0 +0xe7 0xf0 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v31, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xcc + +# CHECK: wflnxb %v14, %v17 +0xe7 0xe1 0x00 0x18 0x44 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: 
wflpsb %f0, %f15 +0xe7 0x0f 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %v31 +0xe7 0x0f 0x00 0x28 0x24 0xcc + +# CHECK: wflpsb %f15, %f0 +0xe7 0xf0 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %v31, %f0 +0xe7 0xf0 0x00 0x28 0x28 0xcc + +# CHECK: wflpsb %f14, %v17 +0xe7 0xe1 0x00 0x28 0x24 0xcc + +# CHECK: wflpxb %v0, %v0 +0xe7 0x00 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v15 +0xe7 0x0f 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v31 +0xe7 0x0f 0x00 0x28 0x44 0xcc + +# CHECK: wflpxb %v15, %v0 +0xe7 0xf0 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v31, %v0 +0xe7 0xf0 0x00 0x28 0x48 0xcc + +# CHECK: wflpxb %v14, %v17 +0xe7 0xe1 0x00 0x28 0x44 0xcc + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %f15 +0xe7 0x0f 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %v31 +0xe7 0x0f 0x00 0x08 0x34 0xc4 + +# CHECK: wflld %v15, %f0 +0xe7 0xf0 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v31, %f0 +0xe7 0xf0 0x00 0x08 0x38 0xc4 + +# CHECK: wflld %v14, %v17 +0xe7 0xe1 0x00 0x08 0x34 0xc4 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc5 + +# CHECK: wflrx %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc5 + +# CHECK: wflrx %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc5 + +# CHECK: wflrx %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc5 + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xef + +# CHECK: wfmaxdb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xef + +# CHECK: wfmaxdb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xef + +# 
CHECK: wfmaxdb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xef + +# CHECK: wfmaxsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xef + +# CHECK: wfmaxsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xef + +# CHECK: wfmaxsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xef + +# CHECK: wfmaxxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xef + +# CHECK: wfmaxxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xef + +# CHECK: wfmaxxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x4a 0xef + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xee + +# CHECK: wfmindb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xee + +# CHECK: wfmindb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xee + +# CHECK: wfmindb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xee + +# CHECK: wfminsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xee + +# CHECK: wfminsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xee + +# CHECK: wfminsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xee + +# CHECK: wfminxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xee + +# 
CHECK: wfminxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xee + +# CHECK: wfminxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xee + +# CHECK: wfminxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xee + +# CHECK: wfminxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xee + +# CHECK: wfminxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x4a 0xee + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8f + +# CHECK: wfmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8f + +# CHECK: wfmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8f + +# CHECK: wfmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8f + +# CHECK: wfmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8f + +# CHECK: wfmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8f + +# CHECK: wfmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8f + +# CHECK: wfmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8f + +# CHECK: wfmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8f + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe7 + +# CHECK: wfmsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe7 + +# CHECK: wfmsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe7 + +# CHECK: wfmsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe7 + +# CHECK: wfmxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe7 + +# CHECK: wfmxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe7 + +# CHECK: wfmxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe7 + +# CHECK: wfmxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe7 + +# CHECK: wfmxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe7 + +# CHECK: wfmssb %f0, %f0, %f0, %f0 
+0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8e + +# CHECK: wfmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8e + +# CHECK: wfmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8e + +# CHECK: wfmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8e + +# CHECK: wfmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8e + +# CHECK: wfmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8e + +# CHECK: wfmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8e + +# CHECK: wfmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8e + +# CHECK: wfmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8e + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9f + +# CHECK: wfnmadb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9f + +# CHECK: wfnmadb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9f + +# CHECK: wfnmadb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9f + +# CHECK: wfnmadb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9f + +# CHECK: wfnmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9f + +# CHECK: wfnmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9f + +# CHECK: wfnmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9f + +# CHECK: wfnmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 
0x08 0xf1 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9f + +# CHECK: wfnmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9f + +# CHECK: wfnmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9f + +# CHECK: wfnmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9f + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9e + +# CHECK: wfnmsdb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9e + +# CHECK: wfnmsdb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9e + +# CHECK: wfnmsdb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9e + +# CHECK: wfnmsdb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9e + +# CHECK: wfnmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9e + +# CHECK: wfnmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9e + +# CHECK: wfnmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9e + +# CHECK: wfnmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9e + +# CHECK: wfnmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9e + +# CHECK: wfnmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9e + +# CHECK: wfnmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9e + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe2 + +# CHECK: wfssb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe2 + +# CHECK: wfssb 
%v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe2 + +# CHECK: wfssb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe2 + +# CHECK: wfsxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe2 + +# CHECK: wfsxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe2 + +# CHECK: wfsxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe2 + +# CHECK: wfsxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe2 + +# CHECK: wfsxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe2 + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xce + +# CHECK: wfsqsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xce + +# CHECK: wfsqsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xce + +# CHECK: wfsqxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xce + +# CHECK: wfsqxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xce + +# CHECK: wfsqxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xce + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 4095 +0xe7 0x00 0xff 0xf8 0x20 0x4a + +# CHECK: wftcisb %f0, %f15, 0 +0xe7 0x0f 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x24 0x4a + +# CHECK: wftcisb %f15, %f0, 0 +0xe7 0xf0 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %v31, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0x4a + +# CHECK: wftcisb %f4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x24 0x4a + +# CHECK: wftcixb %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf8 0x40 0x4a + +# CHECK: wftcixb %v0, %v15, 0 +0xe7 0x0f 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x44 0x4a + +# CHECK: 
wftcixb %v15, %v0, 0 +0xe7 0xf0 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v31, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0x4a + +# CHECK: wftcixb %v4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x44 0x4a + diff --git a/test/MC/Mips/mt/invalid-wrong-error.s b/test/MC/Mips/mt/invalid-wrong-error.s deleted file mode 100644 index 0247089b70ae..000000000000 --- a/test/MC/Mips/mt/invalid-wrong-error.s +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt < %s 2>&1 | FileCheck %s - mftr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list - mttr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/invalid.s b/test/MC/Mips/mt/invalid.s index d4055c4a50f4..5a145a7e0850 100644 --- a/test/MC/Mips/mt/invalid.s +++ b/test/MC/Mips/mt/invalid.s @@ -1,27 +1,13 @@ # RUN: not llvm-mc -arch=mips -mcpu=mips32 -mattr=+mt < %s 2>&1 | FileCheck %s - dmt 4 # CHECK: error: invalid operand for instruction - dmt $4, $5 # CHECK: error: invalid operand for instruction - dmt $5, 0($4) # CHECK: error: invalid operand for instruction - emt 4 # CHECK: error: invalid operand for instruction - emt $4, $5 # CHECK: error: invalid operand for instruction - emt $5, 0($5) # CHECK: error: invalid operand for instruction - dvpe 4 # CHECK: error: invalid operand for instruction - dvpe $4, $5 # CHECK: error: invalid operand for instruction - dvpe $5, 0($4) # CHECK: error: invalid operand for instruction - evpe 4 # CHECK: error: invalid operand for instruction - evpe $4, $5 # CHECK: error: invalid operand for instruction - evpe $5, 0($5) # CHECK: error: invalid operand for instruction - mftr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mftr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mftr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned 
immediate - mftr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mttr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate + dmt 4 # CHECK: error: invalid operand for instruction + dmt $4, $5 # CHECK: error: invalid operand for instruction + dmt $5, 0($4) # CHECK: error: invalid operand for instruction + emt 4 # CHECK: error: invalid operand for instruction + emt $4, $5 # CHECK: error: invalid operand for instruction + emt $5, 0($5) # CHECK: error: invalid operand for instruction + dvpe 4 # CHECK: error: invalid operand for instruction + dvpe $4, $5 # CHECK: error: invalid operand for instruction + dvpe $5, 0($4) # CHECK: error: invalid operand for instruction + evpe 4 # CHECK: error: invalid operand for instruction + evpe $4, $5 # CHECK: error: invalid operand for instruction + evpe $5, 0($5) # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s deleted file mode 100644 index 4e872412e6ef..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - -# The integrated assembler produces a wrong or misleading error message. 
- - mftc0 0($4), $5 # CHECK: error: unexpected token in argument list - mftc0 0($4), $5, 1 # CHECK: error: unexpected token in argument list - mftgpr 0($4), $5 # CHECK: error: unexpected token in argument list - mftlo 0($3) # CHECK: error: unexpected token in argument list - mftlo 0($3), $ac1 # CHECK: error: unexpected token in argument list - mfthi 0($3) # CHECK: error: unexpected token in argument list - mfthi 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftacx 0($3) # CHECK: error: unexpected token in argument list - mftacx 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftdsp 0($4) # CHECK: error: unexpected token in argument list - mftc1 0($4), $f4 # CHECK: error: unexpected token in argument list - mfthc1 0($4), $f4 # CHECK: error: unexpected token in argument list - cftc1 0($4), $f8 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s deleted file mode 100644 index 06ae8c72e654..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s +++ /dev/null @@ -1,23 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - - mftc0 $4, 0($5) # CHECK: error: invalid operand for instruction - mftc0 $4, 0($5), 1 # CHECK: error: invalid operand for instruction - mftc0 $4, $5, -1 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, 9 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, $6 # CHECK: error: expected 3-bit unsigned immediate - mftgpr $4, 0($5) # CHECK: error: invalid operand for instruction - mftgpr $4, $5, $6 # CHECK: error: invalid operand for instruction - mftlo $3, 0($ac1) # CHECK: error: invalid operand for instruction - mftlo $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mfthi $3, 0($ac1) # CHECK: error: invalid operand for instruction - mfthi $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftacx 
$3, 0($ac1) # CHECK: error: invalid operand for instruction - mftacx $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftdsp $4, $5 # CHECK: error: invalid operand for instruction - mftdsp $4, $f5 # CHECK: error: invalid operand for instruction - mftdsp $4, $ac0 # CHECK: error: invalid operand for instruction - mftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - mfthc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, $f4, $5 # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases.s b/test/MC/Mips/mt/mftr-mttr-aliases.s deleted file mode 100644 index 92ed9f9281f2..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases.s +++ /dev/null @@ -1,47 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# Check the various aliases of the m[ft]tr instruction. - - mftc0 $4, $5 # CHECK: mftr $4, $5, 0, 0, 0 # encoding: [0x41,0x05,0x20,0x00] - mftc0 $6, $7, 1 # CHECK: mftr $6, $7, 0, 1, 0 # encoding: [0x41,0x07,0x30,0x01] - mftgpr $5, $9 # CHECK: mftr $5, $9, 1, 0, 0 # encoding: [0x41,0x09,0x28,0x20] - mftlo $3 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac0 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac1 # CHECK: mftr $3, $4, 1, 1, 0 # encoding: [0x41,0x04,0x18,0x21] - mftlo $3, $ac2 # CHECK: mftr $3, $8, 1, 1, 0 # encoding: [0x41,0x08,0x18,0x21] - mftlo $3, $ac3 # CHECK: mftr $3, $12, 1, 1, 0 # encoding: [0x41,0x0c,0x18,0x21] - mfthi $3, $ac0 # CHECK: mftr $3, $1, 1, 1, 0 # encoding: [0x41,0x01,0x18,0x21] - mfthi $3, $ac1 # CHECK: mftr $3, $5, 1, 1, 0 # encoding: [0x41,0x05,0x18,0x21] - mfthi $3, $ac2 # CHECK: mftr $3, $9, 1, 1, 0 # encoding: [0x41,0x09,0x18,0x21] - mfthi $3, $ac3 # CHECK: mftr $3, $13, 1, 1, 0 # encoding: [0x41,0x0d,0x18,0x21] - mftacx $3, $ac0 # CHECK: mftr $3, $2, 1, 1, 0 # encoding: 
[0x41,0x02,0x18,0x21] - mftacx $3, $ac1 # CHECK: mftr $3, $6, 1, 1, 0 # encoding: [0x41,0x06,0x18,0x21] - mftacx $3, $ac2 # CHECK: mftr $3, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x18,0x21] - mftacx $3, $ac3 # CHECK: mftr $3, $14, 1, 1, 0 # encoding: [0x41,0x0e,0x18,0x21] - mftdsp $4 # CHECK: mftr $4, $16, 1, 1, 0 # encoding: [0x41,0x10,0x20,0x21] - mftc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 0 # encoding: [0x41,0x05,0x20,0x22] - mfthc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 1 # encoding: [0x41,0x05,0x20,0x32] - cftc1 $4, $f9 # CHECK: mftr $4, $9, 1, 3, 0 # encoding: [0x41,0x09,0x20,0x23] - - mttc0 $4, $5 # CHECK: mttr $4, $5, 0, 0, 0 # encoding: [0x41,0x84,0x28,0x00] - mttc0 $6, $7, 1 # CHECK: mttr $6, $7, 0, 1, 0 # encoding: [0x41,0x86,0x38,0x01] - mttgpr $5, $9 # CHECK: mttr $5, $9, 1, 0, 0 # encoding: [0x41,0x85,0x48,0x20] - mttlo $3 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac0 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac1 # CHECK: mttr $3, $4, 1, 1, 0 # encoding: [0x41,0x83,0x20,0x21] - mttlo $3, $ac2 # CHECK: mttr $3, $8, 1, 1, 0 # encoding: [0x41,0x83,0x40,0x21] - mttlo $3, $ac3 # CHECK: mttr $3, $12, 1, 1, 0 # encoding: [0x41,0x83,0x60,0x21] - mtthi $3 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac0 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac1 # CHECK: mttr $3, $5, 1, 1, 0 # encoding: [0x41,0x83,0x28,0x21] - mtthi $3, $ac2 # CHECK: mttr $3, $9, 1, 1, 0 # encoding: [0x41,0x83,0x48,0x21] - mtthi $3, $ac3 # CHECK: mttr $3, $13, 1, 1, 0 # encoding: [0x41,0x83,0x68,0x21] - mttacx $3 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac0 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac1 # CHECK: mttr $3, $6, 1, 1, 0 # encoding: [0x41,0x83,0x30,0x21] - mttacx $3, $ac2 # CHECK: mttr $3, $10, 1, 1, 0 # encoding: [0x41,0x83,0x50,0x21] - mttacx $3, $ac3 # CHECK: mttr $3, 
$14, 1, 1, 0 # encoding: [0x41,0x83,0x70,0x21] - mttdsp $4 # CHECK: mttr $4, $16, 1, 1, 0 # encoding: [0x41,0x84,0x80,0x21] - mttc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 0 # encoding: [0x41,0x84,0x28,0x22] - mtthc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 1 # encoding: [0x41,0x84,0x28,0x32] - cttc1 $4, $f9 # CHECK: mttr $4, $9, 1, 3, 0 # encoding: [0x41,0x84,0x48,0x23] diff --git a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s deleted file mode 100644 index c40e81bfc7d7..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# The selector value and register values here are marked as reserved in the -# documentation, but GAS accepts them without warning. - mftr $31, $31, 1, 1, 0 # CHECK: mftr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x1f,0xf8,0x21] - mttr $31, $31, 1, 1, 0 # CHECK: mttr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x9f,0xf8,0x21] - mftr $31, $13, 1, 6, 0 # CHECK: mftr $ra, $13, 1, 6, 0 # encoding: [0x41,0x0d,0xf8,0x26] - mttr $31, $13, 1, 6, 0 # CHECK: mttr $ra, $13, 1, 6, 0 # encoding: [0x41,0x9f,0x68,0x26] diff --git a/test/MC/Mips/mt/valid.s b/test/MC/Mips/mt/valid.s index 9fa07870a61f..ab1179d05c6a 100644 --- a/test/MC/Mips/mt/valid.s +++ b/test/MC/Mips/mt/valid.s @@ -1,33 +1,13 @@ # RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ # RUN: | FileCheck %s - dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] - dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1] - emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] - emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] - dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] - dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] - evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] - evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] - fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] 
- yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] - yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] - mftr $4, $5, 0, 2, 0 # CHECK: mftr $4, $5, 0, 2, 0 # encoding: [0x41,0x05,0x20,0x02] - mftr $4, $5, 1, 0, 0 # CHECK: mftr $4, $5, 1, 0, 0 # encoding: [0x41,0x05,0x20,0x20] - mftr $4, $0, 1, 1, 0 # CHECK: mftr $4, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x20,0x21] - mftr $4, $10, 1, 1, 0 # CHECK: mftr $4, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x20,0x21] - mftr $4, $10, 1, 2, 0 # CHECK: mftr $4, $10, 1, 2, 0 # encoding: [0x41,0x0a,0x20,0x22] - mftr $4, $10, 1, 2, 1 # CHECK: mftr $4, $10, 1, 2, 1 # encoding: [0x41,0x0a,0x20,0x32] - mftr $4, $26, 1, 3, 0 # CHECK: mftr $4, $26, 1, 3, 0 # encoding: [0x41,0x1a,0x20,0x23] - mftr $4, $31, 1, 3, 0 # CHECK: mftr $4, $ra, 1, 3, 0 # encoding: [0x41,0x1f,0x20,0x23] - mftr $4, $14, 1, 4, 0 # CHECK: mftr $4, $14, 1, 4, 0 # encoding: [0x41,0x0e,0x20,0x24] - mftr $4, $15, 1, 5, 0 # CHECK: mftr $4, $15, 1, 5, 0 # encoding: [0x41,0x0f,0x20,0x25] - mttr $4, $5, 0, 2, 0 # CHECK: mttr $4, $5, 0, 2, 0 # encoding: [0x41,0x84,0x28,0x02] - mttr $4, $5, 1, 0, 0 # CHECK: mttr $4, $5, 1, 0, 0 # encoding: [0x41,0x84,0x28,0x20] - mttr $4, $0, 1, 1, 0 # CHECK: mttr $4, $zero, 1, 1, 0 # encoding: [0x41,0x84,0x00,0x21] - mttr $4, $10, 1, 1, 0 # CHECK: mttr $4, $10, 1, 1, 0 # encoding: [0x41,0x84,0x50,0x21] - mttr $4, $10, 1, 2, 0 # CHECK: mttr $4, $10, 1, 2, 0 # encoding: [0x41,0x84,0x50,0x22] - mttr $4, $10, 1, 2, 1 # CHECK: mttr $4, $10, 1, 2, 1 # encoding: [0x41,0x84,0x50,0x32] - mttr $4, $26, 1, 3, 0 # CHECK: mttr $4, $26, 1, 3, 0 # encoding: [0x41,0x84,0xd0,0x23] - mttr $4, $31, 1, 3, 0 # CHECK: mttr $4, $ra, 1, 3, 0 # encoding: [0x41,0x84,0xf8,0x23] - mttr $4, $14, 1, 4, 0 # CHECK: mttr $4, $14, 1, 4, 0 # encoding: [0x41,0x84,0x70,0x24] - mttr $4, $15, 1, 5, 0 # CHECK: mttr $4, $15, 1, 5, 0 # encoding: [0x41,0x84,0x78,0x25] + dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] + dmt $5 # CHECK: dmt $5 # encoding: 
[0x41,0x65,0x0b,0xc1] + emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] + emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] + dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] + dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] + evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] + evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] + fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] + yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] + yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] diff --git a/test/MC/SystemZ/insn-bad-z13.s b/test/MC/SystemZ/insn-bad-z13.s index e9fac44aa883..0ccdd11cbe97 100644 --- a/test/MC/SystemZ/insn-bad-z13.s +++ b/test/MC/SystemZ/insn-bad-z13.s @@ -4,6 +4,19 @@ # RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch11 < %s 2> %t # RUN: FileCheck < %t %s +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: agh %r0, 0 + + agh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bi 0 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bic 0, 0 + + bi 0 + bic 0, 0 + #CHECK: error: invalid operand #CHECK: cdpt %f0, 0(1), -1 #CHECK: error: invalid operand @@ -150,6 +163,16 @@ cxpt %f0, 0(-), 0 cxpt %f15, 0(1), 0 +#CHECK: error: instruction requires: insert-reference-bits-multiple +#CHECK: irbm %r0, %r0 + + irbm %r0, %r0 + +#CHECK: error: instruction requires: message-security-assist-extension8 +#CHECK: kma %r2, %r4, %r6 + + kma %r2, %r4, %r6 + #CHECK: error: invalid operand #CHECK: lcbb %r0, 0, -1 #CHECK: error: invalid operand @@ -167,6 +190,21 @@ lcbb %r0, 4096, 0 lcbb %r0, 0(%v1,%r2), 0 +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgg %r0, 0 + + lgg %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgsc %r0, 0 + + lgsc %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: llgfsg %r0, 0 + + llgfsg %r0, 0 + #CHECK: error: invalid 
operand #CHECK: llzrgf %r0, -524289 #CHECK: error: invalid operand @@ -249,6 +287,41 @@ lzrg %r0, -524289 lzrg %r0, 524288 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mg %r0, 0 + + mg %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgh %r0, 0 + + mgh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgrk %r0, %r0, %r0 + + mgrk %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msc %r0, 0 + + msc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgc %r0, 0 + + msgc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msrkc %r0, %r0, %r0 + + msrkc %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgrkc %r0, %r0, %r0 + + msgrkc %r0, %r0, %r0 + #CHECK: error: invalid register pair #CHECK: ppno %r1, %r2 #CHECK: error: invalid register pair @@ -257,6 +330,21 @@ ppno %r1, %r2 ppno %r2, %r1 +#CHECK: error: instruction requires: message-security-assist-extension7 +#CHECK: prno %r2, %r4 + + prno %r2, %r4 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: sgh %r0, 0 + + sgh %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: stgsc %r0, 0 + + stgsc %r0, 0 + #CHECK: error: invalid operand #CHECK: stocfh %r0, 0, -1 #CHECK: error: invalid operand @@ -274,6 +362,16 @@ stocfh %r0, 524288, 1 stocfh %r0, 0(%r1,%r2), 1 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vap %v0, %v0, %v0, 0, 0 + + vap %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vbperm %v0, %v0, %v0 + + vbperm %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vcdg %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -410,6 +508,35 @@ vclgdb %v0, %v0, -1, 0 vclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcp %v0, %v0, 0 + + vcp 
%v0, %v0, 0 + +#CHECK: vcvb %r0, %v0, 0 + + vcvb %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvbg %r0, %v0, 0 + + vcvbg %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvd %v0, %r0, 0, 0 + + vcvd %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvdg %v0, %r0, 0, 0 + + vcvdg %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vdp %v0, %v0, %v0, 0, 0 + + vdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: verim %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -828,6 +955,40 @@ vfaezhs %v0, %v0 vfaezhs %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfasb %v0, %v0, %v0 + + vfasb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesbs %v0, %v0, %v0 + + vfcesb %v0, %v0, %v0 + vfcesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsbs %v0, %v0, %v0 + + vfchsb %v0, %v0, %v0 + vfchsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesbs %v0, %v0, %v0 + + vfchesb %v0, %v0, %v0 + vfchesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfdsb %v0, %v0, %v0 + + vfdsb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vfee %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1130,6 +1291,152 @@ vfidb %v0, %v0, -1, 0 vfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfisb %v0, %v0, 0, 0 + + vfisb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedb %v0, %v0, %v0 
+#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesbs %v0, %v0, %v0 + + vfkedb %v0, %v0, %v0 + vfkedbs %v0, %v0, %v0 + vfkesb %v0, %v0, %v0 + vfkesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsbs %v0, %v0, %v0 + + vfkhdb %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v0 + vfkhsb %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesbs %v0, %v0, %v0 + + vfkhedb %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v0 + vfkhesb %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflpsb %v0, %v0 + + vfpsosb %v0, %v0, 0 + vflcsb %v0, %v0 + vflnsb %v0, %v0 + vflpsb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfll %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflls %v0, %v0 + + vfll %v0, %v0, 0, 0 + vflls %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 
+#CHECK: vflr %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflrd %v0, %v0, 0, 0 + + vflr %v0, %v0, 0, 0, 0 + vflrd %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxsb %v0, %v0, %v0, 0 + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmaxdb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfminsb %v0, %v0, %v0, 0 + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmindb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmasb %v0, %v0, %v0, %v0 + + vfmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmsb %v0, %v0, %v0 + + vfmsb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmssb %v0, %v0, %v0, %v0 + + vfmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmasb %v0, %v0, %v0, %v0 + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnmadb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmssb %v0, %v0, %v0, %v0 + + vfnms %v0, %v0, %v0, 
%v0, 0, 0 + vfnmsdb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfssb %v0, %v0, %v0 + + vfssb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfsqsb %v0, %v0 + + vfsqsb %v0, %v0 + #CHECK: error: invalid operand #CHECK: vftci %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -1158,6 +1465,11 @@ vftcidb %v0, %v0, -1 vftcidb %v0, %v0, 4096 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vftcisb %v0, %v0, 0 + + vftcisb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: vgbm %v0, -1 #CHECK: error: invalid operand @@ -1615,6 +1927,11 @@ vlgvh %r0, %v0, 4096 vlgvh %r0, %v0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlip %v0, 0, 0 + + vlip %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vll %v0, %r0, -1 #CHECK: error: invalid operand @@ -1687,6 +2004,11 @@ vllezh %v0, 4096 vllezh %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vllezlf %v0, 0 + + vllezlf %v0, 0 + #CHECK: error: invalid operand #CHECK: vlm %v0, %v0, -1 #CHECK: error: invalid operand @@ -1756,6 +2078,16 @@ vlreph %v0, 4096 vlreph %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrl %v0, 0, 0 + + vlrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrlr %v0, %r0, 0 + + vlrlr %v0, %r0, 0 + #CHECK: error: invalid operand #CHECK: vlvg %v0, %r0, 0, -1 #CHECK: error: invalid operand @@ -1817,6 +2149,39 @@ vlvgh %v0, %r0, 4096 vlvgh %v0, %r0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vmp %v0, %v0, %v0, 0, 0 + + vmp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmslg %v0, %v0, %v0, %v0, 0 + +#CHECK: 
error: instruction requires: vector-packed-decimal +#CHECK: vmsp %v0, %v0, %v0, 0, 0 + + vmsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnn %v0, %v0, %v0 + + vnn %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnx %v0, %v0, %v0 + + vnx %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: voc %v0, %v0, %v0 + + voc %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vpdi %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1825,6 +2190,30 @@ vpdi %v0, %v0, %v0, -1 vpdi %v0, %v0, %v0, 16 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpkz %v0, 0, 0 + + vpkz %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctf %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctg %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopcth %v0, %v0 + + vpopctb %v0, %v0 + vpopctf %v0, %v0 + vpopctg %v0, %v0 + vpopcth %v0, %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpsop %v0, %v0, 0, 0, 0 + + vpsop %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vrep %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1917,6 +2306,11 @@ vrepih %v0, -32769 vrepih %v0, 32768 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vrp %v0, %v0, %v0, 0, 0 + + vrp %v0, %v0, %v0, 0, 0 + #CHECK: error: vector index required #CHECK: vscef %v0, 0(%r1), 0 #CHECK: error: vector index required @@ -1957,6 +2351,11 @@ vsceg %v0, -1(%v0,%r1), 0 vsceg %v0, 4096(%v0,%r1), 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsdp %v0, %v0, %v0, 0, 0 + + vsdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vsldb %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1965,6 +2364,16 @@ vsldb %v0, %v0, %v0, -1 vsldb %v0, %v0, 
%v0, 256 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsp %v0, %v0, %v0, 0, 0 + + vsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsrp %v0, %v0, 0, 0, 0 + + vsrp %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vst %v0, -1 #CHECK: error: invalid operand @@ -2251,6 +2660,26 @@ vstrczhs %v0, %v0, %v0 vstrczhs %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrl %v0, 0, 0 + + vstrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrlr %v0, %r0, 0 + + vstrlr %v0, %r0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vtp %v0 + + vtp %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vupkz %v0, 0, 0 + + vupkz %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: wcdgb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2307,6 +2736,72 @@ wclgdb %v0, %v0, -1, 0 wclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfasb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfaxb %v0, %v0, %v0 + + wfasb %v0, %v0, %v0 + wfaxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcxb %v0, %v0 + + wfcsb %v0, %v0 + wfcxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexbs %v0, %v0, %v0 + + wfcesb %v0, %v0, %v0 + wfcesbs %v0, %v0, %v0 + wfcexb %v0, %v0, %v0 + wfcexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchsb %v0, %v0, %v0 +#CHECK: error: 
instruction requires: vector-enhancements-1 +#CHECK: wfchsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxbs %v0, %v0, %v0 + + wfchsb %v0, %v0, %v0 + wfchsbs %v0, %v0, %v0 + wfchxb %v0, %v0, %v0 + wfchxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexbs %v0, %v0, %v0 + + wfchesb %v0, %v0, %v0 + wfchesbs %v0, %v0, %v0 + wfchexb %v0, %v0, %v0 + wfchexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdxb %v0, %v0, %v0 + + wfdsb %v0, %v0, %v0 + wfdxb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: wfidb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2321,6 +2816,208 @@ wfidb %v0, %v0, -1, 0 wfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfisb %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfixb %v0, %v0, 0, 0 + + wfisb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfksb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkxb %v0, %v0 + + wfksb %v0, %v0 + wfkxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesbs %v0, 
%v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexbs %v0, %v0, %v0 + + wfkedb %v0, %v0, %v0 + wfkedbs %v0, %v0, %v0 + wfkesb %v0, %v0, %v0 + wfkesbs %v0, %v0, %v0 + wfkexb %v0, %v0, %v0 + wfkexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxbs %v0, %v0, %v0 + + wfkhdb %v0, %v0, %v0 + wfkhdbs %v0, %v0, %v0 + wfkhsb %v0, %v0, %v0 + wfkhsbs %v0, %v0, %v0 + wfkhxb %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexbs %v0, %v0, %v0 + + wfkhedb %v0, %v0, %v0 + wfkhedbs %v0, %v0, %v0 + wfkhesb %v0, %v0, %v0 + wfkhesbs %v0, %v0, %v0 + wfkhexb %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsoxb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflcsb %v0, %v0 +#CHECK: 
error: instruction requires: vector-enhancements-1 +#CHECK: wflcxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpxb %v0, %v0 + + wfpsosb %v0, %v0, 0 + wfpsoxb %v0, %v0, 0 + wflcsb %v0, %v0 + wflcxb %v0, %v0 + wflnsb %v0, %v0 + wflnxb %v0, %v0 + wflpsb %v0, %v0 + wflpxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflls %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflld %v0, %v0 + + wflls %v0, %v0 + wflld %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrd %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrx %v0, %v0, 0, 0 + + wflrd %v0, %v0, 0, 0 + wflrx %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxxb %v0, %v0, %v0, 0 + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxsb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminxb %v0, %v0, %v0, 0 + + wfmindb %v0, %v0, %v0, 0 + wfminsb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxb %v0, %v0, %v0, %v0 + + wfmasb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v0 + 
+#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmxb %v0, %v0, %v0 + + wfmsb %v0, %v0, %v0 + wfmxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsxb %v0, %v0, %v0, %v0 + + wfmssb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 + + wfnmadb %v0, %v0, %v0, %v0 + wfnmasb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmssb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfssb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsxb %v0, %v0, %v0 + + wfssb %v0, %v0, %v0 + wfsxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqxb %v0, %v0 + + wfsqsb %v0, %v0 + wfsqxb %v0, %v0 + #CHECK: error: invalid operand #CHECK: wftcidb %v0, %v0, -1 #CHECK: error: invalid operand @@ -2329,6 +3026,14 @@ wftcidb %v0, %v0, -1 wftcidb %v0, %v0, 4096 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcisb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcixb %v0, 
%v0, 0 + + wftcisb %v0, %v0, 0 + wftcixb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: wledb %v0, %v0, 0, -1 #CHECK: error: invalid operand diff --git a/test/MC/SystemZ/insn-bad-z14.s b/test/MC/SystemZ/insn-bad-z14.s new file mode 100644 index 000000000000..8bc736a7a1a4 --- /dev/null +++ b/test/MC/SystemZ/insn-bad-z14.s @@ -0,0 +1,752 @@ +# For z14 only. +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=z14 < %s 2> %t +# RUN: FileCheck < %t %s +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch12 < %s 2> %t +# RUN: FileCheck < %t %s + +#CHECK: error: invalid operand +#CHECK: bi -524289 +#CHECK: error: invalid operand +#CHECK: bi 524288 + + bi -524289 + bi 524288 + +#CHECK: error: invalid operand +#CHECK: bic -1, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 16, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 0, -524289 +#CHECK: error: invalid operand +#CHECK: bic 0, 524288 + + bic -1, 0(%r1) + bic 16, 0(%r1) + bic 0, -524289 + bic 0, 524288 + +#CHECK: error: invalid operand +#CHECK: agh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: agh %r0, 524288 + + agh %r0, -524289 + agh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: kma %r1, %r2, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r1, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r4, %r1 + + kma %r1, %r2, %r4 + kma %r2, %r1, %r4 + kma %r2, %r4, %r1 + +#CHECK: error: invalid operand +#CHECK: lgg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgg %r0, 524288 + + lgg %r0, -524289 + lgg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: lgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgsc %r0, 524288 + + lgsc %r0, -524289 + lgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, 524288 + + llgfsg %r0, -524289 + llgfsg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: mg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mg %r0, 524288 +#CHECK: 
error: invalid register pair +#CHECK: mg %r1, 0 + + mg %r0, -524289 + mg %r0, 524288 + mg %r1, 0 + +#CHECK: error: invalid operand +#CHECK: mgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mgh %r0, 524288 + + mgh %r0, -524289 + mgh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: mgrk %r1, %r0, %r0 + + mgrk %r1, %r0, %r0 + +#CHECK: error: invalid operand +#CHECK: msc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msc %r0, 524288 + + msc %r0, -524289 + msc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: msgc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msgc %r0, 524288 + + msgc %r0, -524289 + msgc %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: prno %r1, %r2 +#CHECK: error: invalid register pair +#CHECK: prno %r2, %r1 + + prno %r1, %r2 + prno %r2, %r1 + +#CHECK: error: invalid operand +#CHECK: sgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: sgh %r0, 524288 + + sgh %r0, -524289 + sgh %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: stgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: stgsc %r0, 524288 + + stgsc %r0, -524289 + stgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 256, 0 + + vap %v0, %v0, %v0, 0, -1 + vap %v0, %v0, %v0, 0, 16 + vap %v0, %v0, %v0, -1, 0 + vap %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, 16 + + vcp %v0, %v0, -1 + vcp %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, 16 + + vcvb %r0, %v0, -1 + vcvb %r0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvbg %r0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcvbg %r0, %v0, 16 + + vcvbg %r0, %v0, -1 + vcvbg %r0, %v0, 
16 + +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 256, 0 + + vcvd %r0, %v0, 0, -1 + vcvd %r0, %v0, 0, 16 + vcvd %r0, %v0, -1, 0 + vcvd %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 256, 0 + + vcvdg %r0, %v0, 0, -1 + vcvdg %r0, %v0, 0, 16 + vcvdg %r0, %v0, -1, 0 + vcvdg %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 256, 0 + + vdp %v0, %v0, %v0, 0, -1 + vdp %v0, %v0, %v0, 0, 16 + vdp %v0, %v0, %v0, -1, 0 + vdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 16, 0 + + vfisb %v0, %v0, 0, -1 + vfisb %v0, %v0, 0, 16 + vfisb %v0, %v0, -1, 0 + vfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 16, 0 + + vfll %v0, %v0, 0, -1 + vfll %v0, %v0, 0, 16 + vfll %v0, %v0, -1, 0 + vfll %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, -1, 0 +#CHECK: error: invalid 
operand +#CHECK: vflr %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 16, 0, 0 + + vflr %v0, %v0, 0, 0, -1 + vflr %v0, %v0, 0, 0, 16 + vflr %v0, %v0, 0, -1, 0 + vflr %v0, %v0, 0, 16, 0 + vflr %v0, %v0, -1, 0, 0 + vflr %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 16, 0 + + vflrd %v0, %v0, 0, -1 + vflrd %v0, %v0, 0, 16 + vflrd %v0, %v0, -1, 0 + vflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 16, 0, 0 + + vfmax %v0, %v0, %v0, 0, 0, -1 + vfmax %v0, %v0, %v0, 0, 0, 16 + vfmax %v0, %v0, %v0, 0, -1, 0 + vfmax %v0, %v0, %v0, 0, 16, 0 + vfmax %v0, %v0, %v0, -1, 0, 0 + vfmax %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, 16 + + vfmaxdb %v0, %v0, %v0, -1 + vfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, 16 + + vfmaxsb %v0, %v0, %v0, -1 + vfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: 
invalid operand +#CHECK: vfmin %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 16, 0, 0 + + vfmin %v0, %v0, %v0, 0, 0, -1 + vfmin %v0, %v0, %v0, 0, 0, 16 + vfmin %v0, %v0, %v0, 0, -1, 0 + vfmin %v0, %v0, %v0, 0, 16, 0 + vfmin %v0, %v0, %v0, -1, 0, 0 + vfmin %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, 16 + + vfmindb %v0, %v0, %v0, -1 + vfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, 16 + + vfminsb %v0, %v0, %v0, -1 + vfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 16, 0 + + vfnma %v0, %v0, %v0, %v0, 0, -1 + vfnma %v0, %v0, %v0, %v0, 0, 16 + vfnma %v0, %v0, %v0, %v0, -1, 0 + vfnma %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 16, 0 + + vfnms %v0, %v0, %v0, %v0, 0, -1 + vfnms %v0, %v0, %v0, %v0, 0, 16 + vfnms %v0, %v0, %v0, %v0, -1, 0 + vfnms %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, 4096 + + vftcisb %v0, %v0, -1 + vftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vlip %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 65536, 0 + + vlip %v0, 0, 
-1 + vlip %v0, 0, 16 + vlip %v0, -1, 0 + vlip %v0, 65536, 0 + +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, -1 +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, 4096 +#CHECK: error: invalid use of vector addressing +#CHECK: vllezlf %v0, 0(%v1,%r2) + + vllezlf %v0, -1 + vllezlf %v0, 4096 + vllezlf %v0, 0(%v1,%r2) + +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vlrl %v0, 0(%r0), 0 + + vlrl %v0, 0, -1 + vlrl %v0, 0, 256 + vlrl %v0, -1, 0 + vlrl %v0, 4096, 0 + vlrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vlrlr %v0, %r0, 0(%r0) + + vlrlr %v0, %r0, -1 + vlrlr %v0, %r0, 4096 + vlrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 256, 0 + + vmp %v0, %v0, %v0, 0, -1 + vmp %v0, %v0, %v0, 0, 16 + vmp %v0, %v0, %v0, -1, 0 + vmp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 256, 0 + + vmsp %v0, %v0, %v0, 0, -1 + vmsp %v0, %v0, %v0, 0, 16 + vmsp %v0, %v0, %v0, -1, 0 + vmsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid 
operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 16, 0 + + vmsl %v0, %v0, %v0, %v0, 0, -1 + vmsl %v0, %v0, %v0, %v0, 0, 16 + vmsl %v0, %v0, %v0, %v0, -1, 0 + vmsl %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, 16 + + vmslg %v0, %v0, %v0, %v0, -1 + vmslg %v0, %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vpkz %v0, 0(%r0), 0 + + vpkz %v0, 0, -1 + vpkz %v0, 0, 256 + vpkz %v0, -1, 0 + vpkz %v0, 4096, 0 + vpkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 256, 0, 0 + + vpsop %v0, %v0, 0, 0, -1 + vpsop %v0, %v0, 0, 0, 16 + vpsop %v0, %v0, 0, -1, 0 + vpsop %v0, %v0, 0, 256, 0 + vpsop %v0, %v0, -1, 0, 0 + vpsop %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 256, 0 + + vrp %v0, %v0, %v0, 0, -1 + vrp %v0, %v0, %v0, 0, 16 + vrp %v0, %v0, %v0, -1, 0 + vrp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 
256, 0 + + vsdp %v0, %v0, %v0, 0, -1 + vsdp %v0, %v0, %v0, 0, 16 + vsdp %v0, %v0, %v0, -1, 0 + vsdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 256, 0 + + vsp %v0, %v0, %v0, 0, -1 + vsp %v0, %v0, %v0, 0, 16 + vsp %v0, %v0, %v0, -1, 0 + vsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 256, 0, 0 + + vsrp %v0, %v0, 0, 0, -1 + vsrp %v0, %v0, 0, 0, 16 + vsrp %v0, %v0, 0, -1, 0 + vsrp %v0, %v0, 0, 256, 0 + vsrp %v0, %v0, -1, 0, 0 + vsrp %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vstrl %v0, 0(%r0), 0 + + vstrl %v0, 0, -1 + vstrl %v0, 0, 256 + vstrl %v0, -1, 0 + vstrl %v0, 4096, 0 + vstrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vstrlr %v0, %r0, 0(%r0) + + vstrlr %v0, %r0, -1 + vstrlr %v0, %r0, 4096 + vstrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 4096, 0 +#CHECK: error: %r0 used 
in an address +#CHECK: vupkz %v0, 0(%r0), 0 + + vupkz %v0, 0, -1 + vupkz %v0, 0, 256 + vupkz %v0, -1, 0 + vupkz %v0, 4096, 0 + vupkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 16, 0 + + wfisb %v0, %v0, 0, -1 + wfisb %v0, %v0, 0, 16 + wfisb %v0, %v0, -1, 0 + wfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 16, 0 + + wfixb %v0, %v0, 0, -1 + wfixb %v0, %v0, 0, 16 + wfixb %v0, %v0, -1, 0 + wfixb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 16, 0 + + wflrd %v0, %v0, 0, -1 + wflrd %v0, %v0, 0, 16 + wflrd %v0, %v0, -1, 0 + wflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 16, 0 + + wflrx %v0, %v0, 0, -1 + wflrx %v0, %v0, 0, 16 + wflrx %v0, %v0, -1, 0 + wflrx %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, 16 + + wfmaxdb %v0, %v0, %v0, -1 + wfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, 16 + + wfmaxsb %v0, %v0, %v0, -1 + wfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, 
%v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, %v0, %v0, 16 + + wfmaxxb %v0, %v0, %v0, -1 + wfmaxxb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, 16 + + wfmindb %v0, %v0, %v0, -1 + wfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, 16 + + wfminsb %v0, %v0, %v0, -1 + wfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, 16 + + wfminxb %v0, %v0, %v0, -1 + wfminxb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, 4096 + + wftcisb %v0, %v0, -1 + wftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, 4096 + + wftcixb %v0, %v0, -1 + wftcixb %v0, %v0, 4096 + diff --git a/test/MC/SystemZ/insn-good-z14.s b/test/MC/SystemZ/insn-good-z14.s new file mode 100644 index 000000000000..1fcdcb4ccab0 --- /dev/null +++ b/test/MC/SystemZ/insn-good-z14.s @@ -0,0 +1,2674 @@ +# For z14 and above. 
+# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z14 -show-encoding %s \ +# RUN: | FileCheck %s +# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch12 -show-encoding %s \ +# RUN: | FileCheck %s + +#CHECK: agh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x38] +#CHECK: agh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x38] +#CHECK: agh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x38] +#CHECK: agh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x38] +#CHECK: agh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x38] +#CHECK: agh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x38] +#CHECK: agh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x38] +#CHECK: agh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x38] +#CHECK: agh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x38] +#CHECK: agh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x38] + + agh %r0, -524288 + agh %r0, -1 + agh %r0, 0 + agh %r0, 1 + agh %r0, 524287 + agh %r0, 0(%r1) + agh %r0, 0(%r15) + agh %r0, 524287(%r1,%r15) + agh %r0, 524287(%r15,%r1) + agh %r15, 0 + +#CHECK: bi -524288 # encoding: [0xe3,0xf0,0x00,0x00,0x80,0x47] +#CHECK: bi -1 # encoding: [0xe3,0xf0,0x0f,0xff,0xff,0x47] +#CHECK: bi 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] +#CHECK: bi 1 # encoding: [0xe3,0xf0,0x00,0x01,0x00,0x47] +#CHECK: bi 524287 # encoding: [0xe3,0xf0,0x0f,0xff,0x7f,0x47] +#CHECK: bi 0(%r1) # encoding: [0xe3,0xf0,0x10,0x00,0x00,0x47] +#CHECK: bi 0(%r15) # encoding: [0xe3,0xf0,0xf0,0x00,0x00,0x47] +#CHECK: bi 524287(%r1,%r15) # encoding: [0xe3,0xf1,0xff,0xff,0x7f,0x47] +#CHECK: bi 524287(%r15,%r1) # encoding: [0xe3,0xff,0x1f,0xff,0x7f,0x47] + + bi -524288 + bi -1 + bi 0 + bi 1 + bi 524287 + bi 0(%r1) + bi 0(%r15) + bi 524287(%r1,%r15) + bi 524287(%r15,%r1) + +#CHECK: bic 0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x47] +#CHECK: bic 0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x47] +#CHECK: bic 0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x47] +#CHECK: bic 0, 1 # encoding: 
[0xe3,0x00,0x00,0x01,0x00,0x47] +#CHECK: bic 0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x47] +#CHECK: bic 0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x47] +#CHECK: bic 0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x47] +#CHECK: bic 0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x47] +#CHECK: bic 0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x47] +#CHECK: bic 15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] + + bic 0, -524288 + bic 0, -1 + bic 0, 0 + bic 0, 1 + bic 0, 524287 + bic 0, 0(%r1) + bic 0, 0(%r15) + bic 0, 524287(%r1,%r15) + bic 0, 524287(%r15,%r1) + bic 15, 0 + +#CHECK: bic 1, 0(%r7) # encoding: [0xe3,0x10,0x70,0x00,0x00,0x47] +#CHECK: bio 0(%r15) # encoding: [0xe3,0x10,0xf0,0x00,0x00,0x47] + + bic 1, 0(%r7) + bio 0(%r15) + +#CHECK: bic 2, 0(%r7) # encoding: [0xe3,0x20,0x70,0x00,0x00,0x47] +#CHECK: bih 0(%r15) # encoding: [0xe3,0x20,0xf0,0x00,0x00,0x47] + + bic 2, 0(%r7) + bih 0(%r15) + +#CHECK: bic 3, 0(%r7) # encoding: [0xe3,0x30,0x70,0x00,0x00,0x47] +#CHECK: binle 0(%r15) # encoding: [0xe3,0x30,0xf0,0x00,0x00,0x47] + + bic 3, 0(%r7) + binle 0(%r15) + +#CHECK: bic 4, 0(%r7) # encoding: [0xe3,0x40,0x70,0x00,0x00,0x47] +#CHECK: bil 0(%r15) # encoding: [0xe3,0x40,0xf0,0x00,0x00,0x47] + + bic 4, 0(%r7) + bil 0(%r15) + +#CHECK: bic 5, 0(%r7) # encoding: [0xe3,0x50,0x70,0x00,0x00,0x47] +#CHECK: binhe 0(%r15) # encoding: [0xe3,0x50,0xf0,0x00,0x00,0x47] + + bic 5, 0(%r7) + binhe 0(%r15) + +#CHECK: bic 6, 0(%r7) # encoding: [0xe3,0x60,0x70,0x00,0x00,0x47] +#CHECK: bilh 0(%r15) # encoding: [0xe3,0x60,0xf0,0x00,0x00,0x47] + + bic 6, 0(%r7) + bilh 0(%r15) + +#CHECK: bic 7, 0(%r7) # encoding: [0xe3,0x70,0x70,0x00,0x00,0x47] +#CHECK: bine 0(%r15) # encoding: [0xe3,0x70,0xf0,0x00,0x00,0x47] + + bic 7, 0(%r7) + bine 0(%r15) + +#CHECK: bic 8, 0(%r7) # encoding: [0xe3,0x80,0x70,0x00,0x00,0x47] +#CHECK: bie 0(%r15) # encoding: [0xe3,0x80,0xf0,0x00,0x00,0x47] + + bic 8, 0(%r7) + bie 0(%r15) + +#CHECK: bic 9, 0(%r7) # encoding: 
[0xe3,0x90,0x70,0x00,0x00,0x47] +#CHECK: binlh 0(%r15) # encoding: [0xe3,0x90,0xf0,0x00,0x00,0x47] + + bic 9, 0(%r7) + binlh 0(%r15) + +#CHECK: bic 10, 0(%r7) # encoding: [0xe3,0xa0,0x70,0x00,0x00,0x47] +#CHECK: bihe 0(%r15) # encoding: [0xe3,0xa0,0xf0,0x00,0x00,0x47] + + bic 10, 0(%r7) + bihe 0(%r15) + +#CHECK: bic 11, 0(%r7) # encoding: [0xe3,0xb0,0x70,0x00,0x00,0x47] +#CHECK: binl 0(%r15) # encoding: [0xe3,0xb0,0xf0,0x00,0x00,0x47] + + bic 11, 0(%r7) + binl 0(%r15) + +#CHECK: bic 12, 0(%r7) # encoding: [0xe3,0xc0,0x70,0x00,0x00,0x47] +#CHECK: bile 0(%r15) # encoding: [0xe3,0xc0,0xf0,0x00,0x00,0x47] + + bic 12, 0(%r7) + bile 0(%r15) + +#CHECK: bic 13, 0(%r7) # encoding: [0xe3,0xd0,0x70,0x00,0x00,0x47] +#CHECK: binh 0(%r15) # encoding: [0xe3,0xd0,0xf0,0x00,0x00,0x47] + + bic 13, 0(%r7) + binh 0(%r15) + +#CHECK: bic 14, 0(%r7) # encoding: [0xe3,0xe0,0x70,0x00,0x00,0x47] +#CHECK: bino 0(%r15) # encoding: [0xe3,0xe0,0xf0,0x00,0x00,0x47] + + bic 14, 0(%r7) + bino 0(%r15) + +#CHECK: irbm %r0, %r0 # encoding: [0xb9,0xac,0x00,0x00] +#CHECK: irbm %r0, %r15 # encoding: [0xb9,0xac,0x00,0x0f] +#CHECK: irbm %r15, %r0 # encoding: [0xb9,0xac,0x00,0xf0] +#CHECK: irbm %r7, %r8 # encoding: [0xb9,0xac,0x00,0x78] +#CHECK: irbm %r15, %r15 # encoding: [0xb9,0xac,0x00,0xff] + + irbm %r0,%r0 + irbm %r0,%r15 + irbm %r15,%r0 + irbm %r7,%r8 + irbm %r15,%r15 + +#CHECK: kma %r2, %r2, %r2 # encoding: [0xb9,0x29,0x20,0x22] +#CHECK: kma %r2, %r8, %r14 # encoding: [0xb9,0x29,0x80,0x2e] +#CHECK: kma %r14, %r8, %r2 # encoding: [0xb9,0x29,0x80,0xe2] +#CHECK: kma %r6, %r8, %r10 # encoding: [0xb9,0x29,0x80,0x6a] + + kma %r2, %r2, %r2 + kma %r2, %r8, %r14 + kma %r14, %r8, %r2 + kma %r6, %r8, %r10 + +#CHECK: lgg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4c] +#CHECK: lgg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4c] +#CHECK: lgg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4c] +#CHECK: lgg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4c] +#CHECK: lgg %r0, 524287 # encoding: 
[0xe3,0x00,0x0f,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4c] +#CHECK: lgg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4c] +#CHECK: lgg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4c] +#CHECK: lgg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x4c] + + lgg %r0, -524288 + lgg %r0, -1 + lgg %r0, 0 + lgg %r0, 1 + lgg %r0, 524287 + lgg %r0, 0(%r1) + lgg %r0, 0(%r15) + lgg %r0, 524287(%r1,%r15) + lgg %r0, 524287(%r15,%r1) + lgg %r15, 0 + +#CHECK: lgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4d] +#CHECK: lgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4d] +#CHECK: lgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4d] +#CHECK: lgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4d] + + lgsc %r0, -524288 + lgsc %r0, -1 + lgsc %r0, 0 + lgsc %r0, 1 + lgsc %r0, 524287 + lgsc %r0, 0(%r1) + lgsc %r0, 0(%r15) + lgsc %r0, 524287(%r1,%r15) + lgsc %r0, 524287(%r15,%r1) + +#CHECK: llgfsg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x48] +#CHECK: llgfsg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x48] +#CHECK: llgfsg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x48] +#CHECK: llgfsg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 
524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x48] +#CHECK: llgfsg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x48] + + llgfsg %r0, -524288 + llgfsg %r0, -1 + llgfsg %r0, 0 + llgfsg %r0, 1 + llgfsg %r0, 524287 + llgfsg %r0, 0(%r1) + llgfsg %r0, 0(%r15) + llgfsg %r0, 524287(%r1,%r15) + llgfsg %r0, 524287(%r15,%r1) + llgfsg %r15, 0 + +#CHECK: mg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x84] +#CHECK: mg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x84] +#CHECK: mg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x84] +#CHECK: mg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x84] +#CHECK: mg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x84] +#CHECK: mg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x84] +#CHECK: mg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x84] +#CHECK: mg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x84] +#CHECK: mg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x84] +#CHECK: mg %r14, 0 # encoding: [0xe3,0xe0,0x00,0x00,0x00,0x84] + + mg %r0, -524288 + mg %r0, -1 + mg %r0, 0 + mg %r0, 1 + mg %r0, 524287 + mg %r0, 0(%r1) + mg %r0, 0(%r15) + mg %r0, 524287(%r1,%r15) + mg %r0, 524287(%r15,%r1) + mg %r14, 0 + +#CHECK: mgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3c] +#CHECK: mgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3c] +#CHECK: mgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3c] +#CHECK: mgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3c] +#CHECK: mgh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3c] +#CHECK: mgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: mgh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3c] +#CHECK: mgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3c] + + mgh %r0, -524288 + mgh %r0, -1 + mgh %r0, 0 + mgh %r0, 1 + mgh %r0, 524287 + mgh %r0, 0(%r1) 
+ mgh %r0, 0(%r15) + mgh %r0, 524287(%r1,%r15) + mgh %r0, 524287(%r15,%r1) + mgh %r15, 0 + +#CHECK: mgrk %r0, %r0, %r0 # encoding: [0xb9,0xec,0x00,0x00] +#CHECK: mgrk %r0, %r0, %r15 # encoding: [0xb9,0xec,0xf0,0x00] +#CHECK: mgrk %r0, %r15, %r0 # encoding: [0xb9,0xec,0x00,0x0f] +#CHECK: mgrk %r14, %r0, %r0 # encoding: [0xb9,0xec,0x00,0xe0] +#CHECK: mgrk %r6, %r8, %r9 # encoding: [0xb9,0xec,0x90,0x68] + + mgrk %r0,%r0,%r0 + mgrk %r0,%r0,%r15 + mgrk %r0,%r15,%r0 + mgrk %r14,%r0,%r0 + mgrk %r6,%r8,%r9 + +#CHECK: msc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x53] +#CHECK: msc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x53] +#CHECK: msc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x53] +#CHECK: msc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x53] +#CHECK: msc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x53] +#CHECK: msc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x53] +#CHECK: msc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x53] +#CHECK: msc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x53] +#CHECK: msc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x53] +#CHECK: msc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x53] + + msc %r0, -524288 + msc %r0, -1 + msc %r0, 0 + msc %r0, 1 + msc %r0, 524287 + msc %r0, 0(%r1) + msc %r0, 0(%r15) + msc %r0, 524287(%r1,%r15) + msc %r0, 524287(%r15,%r1) + msc %r15, 0 + +#CHECK: msgc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x83] +#CHECK: msgc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x83] +#CHECK: msgc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x83] +#CHECK: msgc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x83] +#CHECK: msgc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x83] +#CHECK: msgc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x83] +#CHECK: msgc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x83] +#CHECK: msgc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x83] +#CHECK: msgc %r0, 524287(%r15,%r1) # encoding: 
[0xe3,0x0f,0x1f,0xff,0x7f,0x83] +#CHECK: msgc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x83] + + msgc %r0, -524288 + msgc %r0, -1 + msgc %r0, 0 + msgc %r0, 1 + msgc %r0, 524287 + msgc %r0, 0(%r1) + msgc %r0, 0(%r15) + msgc %r0, 524287(%r1,%r15) + msgc %r0, 524287(%r15,%r1) + msgc %r15, 0 + +#CHECK: msrkc %r0, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0x00] +#CHECK: msrkc %r0, %r0, %r15 # encoding: [0xb9,0xfd,0xf0,0x00] +#CHECK: msrkc %r0, %r15, %r0 # encoding: [0xb9,0xfd,0x00,0x0f] +#CHECK: msrkc %r15, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0xf0] +#CHECK: msrkc %r7, %r8, %r9 # encoding: [0xb9,0xfd,0x90,0x78] + + msrkc %r0,%r0,%r0 + msrkc %r0,%r0,%r15 + msrkc %r0,%r15,%r0 + msrkc %r15,%r0,%r0 + msrkc %r7,%r8,%r9 + +#CHECK: msgrkc %r0, %r0, %r0 # encoding: [0xb9,0xed,0x00,0x00] +#CHECK: msgrkc %r0, %r0, %r15 # encoding: [0xb9,0xed,0xf0,0x00] +#CHECK: msgrkc %r0, %r15, %r0 # encoding: [0xb9,0xed,0x00,0x0f] +#CHECK: msgrkc %r15, %r0, %r0 # encoding: [0xb9,0xed,0x00,0xf0] +#CHECK: msgrkc %r7, %r8, %r9 # encoding: [0xb9,0xed,0x90,0x78] + + msgrkc %r0,%r0,%r0 + msgrkc %r0,%r0,%r15 + msgrkc %r0,%r15,%r0 + msgrkc %r15,%r0,%r0 + msgrkc %r7,%r8,%r9 + +#CHECK: prno %r2, %r2 # encoding: [0xb9,0x3c,0x00,0x22] +#CHECK: prno %r2, %r14 # encoding: [0xb9,0x3c,0x00,0x2e] +#CHECK: prno %r14, %r2 # encoding: [0xb9,0x3c,0x00,0xe2] +#CHECK: prno %r6, %r10 # encoding: [0xb9,0x3c,0x00,0x6a] + + prno %r2, %r2 + prno %r2, %r14 + prno %r14, %r2 + prno %r6, %r10 + +#CHECK: sgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x39] +#CHECK: sgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x39] +#CHECK: sgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x39] +#CHECK: sgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x39] +#CHECK: sgh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x39] +#CHECK: sgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x39] +#CHECK: sgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x39] +#CHECK: sgh %r0, 524287(%r1,%r15) # encoding: 
[0xe3,0x01,0xff,0xff,0x7f,0x39] +#CHECK: sgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x39] +#CHECK: sgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x39] + + sgh %r0, -524288 + sgh %r0, -1 + sgh %r0, 0 + sgh %r0, 1 + sgh %r0, 524287 + sgh %r0, 0(%r1) + sgh %r0, 0(%r15) + sgh %r0, 524287(%r1,%r15) + sgh %r0, 524287(%r15,%r1) + sgh %r15, 0 + +#CHECK: stgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x49] +#CHECK: stgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x49] +#CHECK: stgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x49] +#CHECK: stgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x49] +#CHECK: stgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x49] +#CHECK: stgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x49] +#CHECK: stgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x49] + + stgsc %r0, -524288 + stgsc %r0, -1 + stgsc %r0, 0 + stgsc %r0, 1 + stgsc %r0, 524287 + stgsc %r0, 0(%r1) + stgsc %r0, 0(%r15) + stgsc %r0, 524287(%r1,%r15) + stgsc %r0, 524287(%r15,%r1) + +#CHECK: vap %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x71] +#CHECK: vap %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x71] +#CHECK: vap %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x71] +#CHECK: vap %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x71] +#CHECK: vap %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x71] + + vap %v0, %v0, %v0, 0, 0 + vap %v0, %v0, %v0, 0, 15 + vap %v0, %v0, %v0, 255, 0 + vap %v0, %v0, %v31, 0, 0 + vap %v0, %v31, %v0, 0, 0 + vap %v31, %v0, %v0, 0, 0 + vap %v13, %v17, %v21, 0x79, 11 + +#CHECK: vbperm %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x85] +#CHECK: vbperm %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x85] +#CHECK: vbperm %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x85] +#CHECK: vbperm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x85] + + vbperm %v0, %v0, %v0 + vbperm %v0, %v0, %v15 + vbperm %v0, %v0, %v31 + vbperm %v0, %v15, %v0 + vbperm %v0, %v31, %v0 + vbperm %v15, %v0, %v0 + vbperm %v31, %v0, %v0 + vbperm %v18, %v3, %v20 + +#CHECK: vcp %v0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x77] +#CHECK: vcp %v0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x77] +#CHECK: vcp %v15, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x77] +#CHECK: vcp %v31, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x77] +#CHECK: vcp %v0, %v15, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x77] +#CHECK: vcp %v0, %v31, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x77] +#CHECK: vcp %v3, %v18, 4 # encoding: [0xe6,0x03,0x20,0x40,0x02,0x77] + + vcp %v0, %v0, 0 + vcp %v0, %v0, 15 + vcp %v15, %v0, 0 + vcp %v31, %v0, 0 + vcp %v0, %v15, 0 + vcp %v0, %v31, 0 + vcp %v3, %v18, 4 + +#CHECK: vcvb %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x50] +#CHECK: vcvb %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vcvb %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x50] + + vcvb %r0, %v0, 0 + vcvb %r0, %v0, 15 + vcvb %r15, %v0, 0 + vcvb %r0, %v15, 0 + vcvb %r0, %v31, 0 + vcvb %r3, %v18, 4 + +#CHECK: vcvbg %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x52] 
+#CHECK: vcvbg %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x52] +#CHECK: vcvbg %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x52] +#CHECK: vcvbg %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x52] + + vcvbg %r0, %v0, 0 + vcvbg %r0, %v0, 15 + vcvbg %r15, %v0, 0 + vcvbg %r0, %v15, 0 + vcvbg %r0, %v31, 0 + vcvbg %r3, %v18, 4 + +#CHECK: vcvd %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x58] +#CHECK: vcvd %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x58] +#CHECK: vcvd %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x58] +#CHECK: vcvd %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x58] + + vcvd %v0, %r0, 0, 0 + vcvd %v0, %r0, 0, 15 + vcvd %v0, %r0, 255, 0 + vcvd %v0, %r15, 0, 0 + vcvd %v15, %r0, 0, 0 + vcvd %v31, %r0, 0, 0 + vcvd %v18, %r9, 0x34, 11 + +#CHECK: vcvdg %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5a] +#CHECK: vcvdg %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5a] +#CHECK: vcvdg %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x5a] + + vcvdg %v0, %r0, 0, 0 + vcvdg %v0, %r0, 0, 15 + vcvdg %v0, %r0, 255, 0 + vcvdg %v0, %r15, 0, 0 + vcvdg %v15, %r0, 0, 0 + vcvdg %v31, %r0, 0, 0 + vcvdg %v18, %r9, 0x34, 11 + +#CHECK: vdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 0, 15 # 
encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7a] +#CHECK: vdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7a] +#CHECK: vdp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7a] +#CHECK: vdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7a] +#CHECK: vdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7a] + + vdp %v0, %v0, %v0, 0, 0 + vdp %v0, %v0, %v0, 0, 15 + vdp %v0, %v0, %v0, 255, 0 + vdp %v0, %v0, %v31, 0, 0 + vdp %v0, %v31, %v0, 0, 0 + vdp %v31, %v0, %v0, 0, 0 + vdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vfasb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe3] +#CHECK: vfasb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe3] +#CHECK: vfasb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe3] +#CHECK: vfasb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe3] +#CHECK: vfasb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe3] + + vfasb %v0, %v0, %v0 + vfasb %v0, %v0, %v31 + vfasb %v0, %v31, %v0 + vfasb %v31, %v0, %v0 + vfasb %v18, %v3, %v20 + +#CHECK: vfcesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe8] +#CHECK: vfcesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe8] +#CHECK: vfcesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe8] +#CHECK: vfcesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe8] +#CHECK: vfcesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe8] + + vfcesb %v0, %v0, %v0 + vfcesb %v0, %v0, %v31 + vfcesb %v0, %v31, %v0 + vfcesb %v31, %v0, %v0 + vfcesb %v18, %v3, %v20 + +#CHECK: vfcesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xe8] +#CHECK: vfcesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xe8] +#CHECK: vfcesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xe8] +#CHECK: vfcesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xe8] +#CHECK: vfcesbs %v18, %v3, %v20 # encoding: 
[0xe7,0x23,0x40,0x10,0x2a,0xe8] + + vfcesbs %v0, %v0, %v0 + vfcesbs %v0, %v0, %v31 + vfcesbs %v0, %v31, %v0 + vfcesbs %v31, %v0, %v0 + vfcesbs %v18, %v3, %v20 + +#CHECK: vfchsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xeb] +#CHECK: vfchsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xeb] +#CHECK: vfchsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xeb] +#CHECK: vfchsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xeb] +#CHECK: vfchsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xeb] + + vfchsb %v0, %v0, %v0 + vfchsb %v0, %v0, %v31 + vfchsb %v0, %v31, %v0 + vfchsb %v31, %v0, %v0 + vfchsb %v18, %v3, %v20 + +#CHECK: vfchsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xeb] +#CHECK: vfchsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xeb] +#CHECK: vfchsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xeb] +#CHECK: vfchsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xeb] +#CHECK: vfchsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xeb] + + vfchsbs %v0, %v0, %v0 + vfchsbs %v0, %v0, %v31 + vfchsbs %v0, %v31, %v0 + vfchsbs %v31, %v0, %v0 + vfchsbs %v18, %v3, %v20 + +#CHECK: vfchesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xea] +#CHECK: vfchesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xea] +#CHECK: vfchesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xea] +#CHECK: vfchesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xea] +#CHECK: vfchesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xea] + + vfchesb %v0, %v0, %v0 + vfchesb %v0, %v0, %v31 + vfchesb %v0, %v31, %v0 + vfchesb %v31, %v0, %v0 + vfchesb %v18, %v3, %v20 + +#CHECK: vfchesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xea] +#CHECK: vfchesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xea] +#CHECK: vfchesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xea] +#CHECK: vfchesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xea] +#CHECK: 
vfchesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xea] + + vfchesbs %v0, %v0, %v0 + vfchesbs %v0, %v0, %v31 + vfchesbs %v0, %v31, %v0 + vfchesbs %v31, %v0, %v0 + vfchesbs %v18, %v3, %v20 + +#CHECK: vfdsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe5] +#CHECK: vfdsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe5] +#CHECK: vfdsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe5] +#CHECK: vfdsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe5] +#CHECK: vfdsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe5] + + vfdsb %v0, %v0, %v0 + vfdsb %v0, %v0, %v31 + vfdsb %v0, %v31, %v0 + vfdsb %v31, %v0, %v0 + vfdsb %v18, %v3, %v20 + +#CHECK: vfisb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x07,0x20,0xc7] +#CHECK: vfisb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc7] +#CHECK: vfisb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc7] +#CHECK: vfisb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x24,0xc7] + + vfisb %v0, %v0, 0, 0 + vfisb %v0, %v0, 0, 15 + vfisb %v0, %v0, 4, 0 + vfisb %v0, %v0, 7, 0 + vfisb %v0, %v31, 0, 0 + vfisb %v31, %v0, 0, 0 + vfisb %v14, %v17, 4, 10 + +#CHECK: vfkedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xe8] +#CHECK: vfkedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xe8] +#CHECK: vfkedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xe8] +#CHECK: vfkedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xe8] +#CHECK: vfkedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xe8] + + vfkedb %v0, %v0, %v0 + vfkedb %v0, %v0, %v31 + vfkedb %v0, %v31, %v0 + vfkedb %v31, %v0, %v0 + vfkedb %v18, %v3, %v20 + +#CHECK: vfkedbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xe8] +#CHECK: vfkedbs %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0xf0,0x14,0x32,0xe8] +#CHECK: vfkedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xe8] +#CHECK: vfkedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xe8] +#CHECK: vfkedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xe8] + + vfkedbs %v0, %v0, %v0 + vfkedbs %v0, %v0, %v31 + vfkedbs %v0, %v31, %v0 + vfkedbs %v31, %v0, %v0 + vfkedbs %v18, %v3, %v20 + +#CHECK: vfkesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xe8] +#CHECK: vfkesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xe8] +#CHECK: vfkesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xe8] +#CHECK: vfkesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xe8] +#CHECK: vfkesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xe8] + + vfkesb %v0, %v0, %v0 + vfkesb %v0, %v0, %v31 + vfkesb %v0, %v31, %v0 + vfkesb %v31, %v0, %v0 + vfkesb %v18, %v3, %v20 + +#CHECK: vfkesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xe8] +#CHECK: vfkesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xe8] +#CHECK: vfkesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xe8] +#CHECK: vfkesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xe8] +#CHECK: vfkesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xe8] + + vfkesbs %v0, %v0, %v0 + vfkesbs %v0, %v0, %v31 + vfkesbs %v0, %v31, %v0 + vfkesbs %v31, %v0, %v0 + vfkesbs %v18, %v3, %v20 + +#CHECK: vfkhdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xeb] +#CHECK: vfkhdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xeb] +#CHECK: vfkhdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xeb] +#CHECK: vfkhdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xeb] +#CHECK: vfkhdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xeb] + + vfkhdb %v0, %v0, %v0 + vfkhdb %v0, %v0, %v31 + vfkhdb %v0, %v31, %v0 + vfkhdb %v31, %v0, %v0 + vfkhdb %v18, %v3, %v20 + +#CHECK: vfkhdbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xeb] +#CHECK: vfkhdbs 
%v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xeb] +#CHECK: vfkhdbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xeb] +#CHECK: vfkhdbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xeb] +#CHECK: vfkhdbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xeb] + + vfkhdbs %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v31 + vfkhdbs %v0, %v31, %v0 + vfkhdbs %v31, %v0, %v0 + vfkhdbs %v18, %v3, %v20 + +#CHECK: vfkhsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xeb] +#CHECK: vfkhsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xeb] +#CHECK: vfkhsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xeb] +#CHECK: vfkhsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xeb] +#CHECK: vfkhsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xeb] + + vfkhsb %v0, %v0, %v0 + vfkhsb %v0, %v0, %v31 + vfkhsb %v0, %v31, %v0 + vfkhsb %v31, %v0, %v0 + vfkhsb %v18, %v3, %v20 + +#CHECK: vfkhsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xeb] +#CHECK: vfkhsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xeb] +#CHECK: vfkhsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xeb] +#CHECK: vfkhsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xeb] +#CHECK: vfkhsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xeb] + + vfkhsbs %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v31 + vfkhsbs %v0, %v31, %v0 + vfkhsbs %v31, %v0, %v0 + vfkhsbs %v18, %v3, %v20 + +#CHECK: vfkhedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xea] +#CHECK: vfkhedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xea] +#CHECK: vfkhedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xea] +#CHECK: vfkhedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xea] +#CHECK: vfkhedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xea] + + vfkhedb %v0, %v0, %v0 + vfkhedb %v0, %v0, %v31 + vfkhedb %v0, %v31, %v0 + vfkhedb %v31, %v0, %v0 + vfkhedb %v18, %v3, %v20 + +#CHECK: vfkhedbs %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x14,0x30,0xea] +#CHECK: vfkhedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xea] +#CHECK: vfkhedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xea] +#CHECK: vfkhedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xea] +#CHECK: vfkhedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xea] + + vfkhedbs %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v31 + vfkhedbs %v0, %v31, %v0 + vfkhedbs %v31, %v0, %v0 + vfkhedbs %v18, %v3, %v20 + +#CHECK: vfkhesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xea] +#CHECK: vfkhesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xea] +#CHECK: vfkhesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xea] +#CHECK: vfkhesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xea] +#CHECK: vfkhesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xea] + + vfkhesb %v0, %v0, %v0 + vfkhesb %v0, %v0, %v31 + vfkhesb %v0, %v31, %v0 + vfkhesb %v31, %v0, %v0 + vfkhesb %v18, %v3, %v20 + +#CHECK: vfkhesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xea] +#CHECK: vfkhesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xea] +#CHECK: vfkhesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xea] +#CHECK: vfkhesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xea] +#CHECK: vfkhesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xea] + + vfkhesbs %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v31 + vfkhesbs %v0, %v31, %v0 + vfkhesbs %v31, %v0, %v0 + vfkhesbs %v18, %v3, %v20 + +#CHECK: vfpsosb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xcc] +#CHECK: vfpsosb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x24,0xcc] +#CHECK: vfpsosb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x28,0xcc] +#CHECK: vfpsosb %v14, %v17, 7 # encoding: 
[0xe7,0xe1,0x00,0x70,0x24,0xcc] + + vfpsosb %v0, %v0, 3 + vfpsosb %v0, %v0, 15 + vfpsosb %v0, %v15, 3 + vfpsosb %v0, %v31, 3 + vfpsosb %v15, %v0, 3 + vfpsosb %v31, %v0, 3 + vfpsosb %v14, %v17, 7 + +#CHECK: vflcsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcc] +#CHECK: vflcsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xcc] +#CHECK: vflcsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcc] + + vflcsb %v0, %v0 + vflcsb %v0, %v15 + vflcsb %v0, %v31 + vflcsb %v15, %v0 + vflcsb %v31, %v0 + vflcsb %v14, %v17 + +#CHECK: vflnsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xcc] +#CHECK: vflnsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xcc] +#CHECK: vflnsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x10,0x24,0xcc] + + vflnsb %v0, %v0 + vflnsb %v0, %v15 + vflnsb %v0, %v31 + vflnsb %v15, %v0 + vflnsb %v31, %v0 + vflnsb %v14, %v17 + +#CHECK: vflpsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x20,0x24,0xcc] +#CHECK: vflpsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x28,0xcc] +#CHECK: vflpsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x20,0x24,0xcc] + + vflpsb %v0, %v0 + vflpsb %v0, %v15 + vflpsb %v0, %v31 + vflpsb %v15, %v0 + vflpsb %v31, %v0 + vflpsb %v14, %v17 + +#CHECK: vfll %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc4] +#CHECK: vfll %v0, %v0, 0, 15 # encoding: 
[0xe7,0x00,0x00,0x0f,0x00,0xc4] +#CHECK: vfll %v0, %v15, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc4] +#CHECK: vfll %v15, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc4] +#CHECK: vfll %v14, %v17, 11, 9 # encoding: [0xe7,0xe1,0x00,0x09,0xb4,0xc4] + + vfll %v0, %v0, 0, 0 + vfll %v0, %v0, 15, 0 + vfll %v0, %v0, 0, 15 + vfll %v0, %v15, 0, 0 + vfll %v0, %v31, 0, 0 + vfll %v15, %v0, 0, 0 + vfll %v31, %v0, 0, 0 + vfll %v14, %v17, 11, 9 + +#CHECK: vflls %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc4] +#CHECK: vflls %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc4] +#CHECK: vflls %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xc4] + + vflls %v0, %v0 + vflls %v0, %v15 + vflls %v0, %v31 + vflls %v15, %v0 + vflls %v31, %v0 + vflls %v14, %v17 + +#CHECK: vflr %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc5] +#CHECK: vflr %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc5] +#CHECK: vflr %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x00,0xc5] +#CHECK: vflr %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc5] +#CHECK: vflr %v31, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc5] +#CHECK: vflr %v14, %v17, 11, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0xb4,0xc5] + + vflr %v0, %v0, 0, 0, 0 + vflr %v0, %v0, 15, 0, 0 + vflr %v0, %v0, 0, 0, 15 + vflr %v0, %v0, 0, 4, 0 + vflr %v0, %v0, 0, 12, 0 + vflr %v0, %v31, 0, 0, 0 + vflr %v31, %v0, 0, 0, 0 + vflr %v14, %v17, 11, 4, 10 + +#CHECK: vflrd %v0, %v0, 0, 0 # encoding: 
[0xe7,0x00,0x00,0x00,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: vflrd %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc5] +#CHECK: vflrd %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc5] +#CHECK: vflrd %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc5] + + vflrd %v0, %v0, 0, 0 + vflrd %v0, %v0, 0, 15 + vflrd %v0, %v0, 4, 0 + vflrd %v0, %v0, 12, 0 + vflrd %v0, %v31, 0, 0 + vflrd %v31, %v0, 0, 0 + vflrd %v14, %v17, 4, 10 + +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xef] +#CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xef] +#CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xef] +#CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xef] + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmax %v0, %v0, %v0, 15, 0, 0 + vfmax %v0, %v0, %v0, 0, 15, 0 + vfmax %v0, %v0, %v0, 0, 0, 4 + vfmax %v0, %v0, %v31, 0, 0, 0 + vfmax %v0, %v31, %v0, 0, 0, 0 + vfmax %v31, %v0, %v0, 0, 0, 0 + vfmax %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmaxdb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xef] +#CHECK: vfmaxdb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xef] +#CHECK: vfmaxdb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xef] +#CHECK: vfmaxdb %v18, 
%v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xef] + + vfmaxdb %v0, %v0, %v0, 0 + vfmaxdb %v0, %v0, %v0, 4 + vfmaxdb %v0, %v0, %v31, 0 + vfmaxdb %v0, %v31, %v0, 0 + vfmaxdb %v31, %v0, %v0, 0 + vfmaxdb %v18, %v3, %v20, 12 + +#CHECK: vfmaxsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xef] +#CHECK: vfmaxsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xef] +#CHECK: vfmaxsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xef] +#CHECK: vfmaxsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xef] + + vfmaxsb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 4 + vfmaxsb %v0, %v0, %v31, 0 + vfmaxsb %v0, %v31, %v0, 0 + vfmaxsb %v31, %v0, %v0, 0 + vfmaxsb %v18, %v3, %v20, 12 + +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xee] +#CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xee] +#CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xee] +#CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xee] + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmin %v0, %v0, %v0, 15, 0, 0 + vfmin %v0, %v0, %v0, 0, 15, 0 + vfmin %v0, %v0, %v0, 0, 0, 4 + vfmin %v0, %v0, %v31, 0, 0, 0 + vfmin %v0, %v31, %v0, 0, 0, 0 + vfmin %v31, %v0, %v0, 0, 0, 0 + vfmin %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmindb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v31, 0 # 
encoding: [0xe7,0x00,0xf0,0x00,0x32,0xee] +#CHECK: vfmindb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xee] +#CHECK: vfmindb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xee] +#CHECK: vfmindb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xee] + + vfmindb %v0, %v0, %v0, 0 + vfmindb %v0, %v0, %v0, 4 + vfmindb %v0, %v0, %v31, 0 + vfmindb %v0, %v31, %v0, 0 + vfmindb %v31, %v0, %v0, 0 + vfmindb %v18, %v3, %v20, 12 + +#CHECK: vfminsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xee] +#CHECK: vfminsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xee] +#CHECK: vfminsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xee] +#CHECK: vfminsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xee] + + vfminsb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 4 + vfminsb %v0, %v0, %v31, 0 + vfminsb %v0, %v31, %v0, 0 + vfminsb %v31, %v0, %v0, 0 + vfminsb %v18, %v3, %v20, 12 + +#CHECK: vfmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8f] +#CHECK: vfmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8f] +#CHECK: vfmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8f] +#CHECK: vfmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8f] +#CHECK: vfmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8f] +#CHECK: vfmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8f] + + vfmasb %v0, %v0, %v0, %v0 + vfmasb %v0, %v0, %v0, %v31 + vfmasb %v0, %v0, %v31, %v0 + vfmasb %v0, %v31, %v0, %v0 + vfmasb %v31, %v0, %v0, %v0 + vfmasb %v13, %v17, %v21, %v25 + +#CHECK: vfmsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe7] +#CHECK: vfmsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe7] +#CHECK: vfmsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe7] +#CHECK: vfmsb 
%v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe7] +#CHECK: vfmsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe7] + + vfmsb %v0, %v0, %v0 + vfmsb %v0, %v0, %v31 + vfmsb %v0, %v31, %v0 + vfmsb %v31, %v0, %v0 + vfmsb %v18, %v3, %v20 + +#CHECK: vfmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8e] +#CHECK: vfmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8e] +#CHECK: vfmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8e] +#CHECK: vfmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8e] +#CHECK: vfmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8e] +#CHECK: vfmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8e] + + vfmssb %v0, %v0, %v0, %v0 + vfmssb %v0, %v0, %v0, %v31 + vfmssb %v0, %v0, %v31, %v0 + vfmssb %v0, %v31, %v0, %v0 + vfmssb %v31, %v0, %v0, %v0 + vfmssb %v13, %v17, %v21, %v25 + +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9f] +#CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9f] +#CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9f] +#CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9f] +#CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9f] + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnma %v0, %v0, %v0, %v0, 0, 15 + vfnma %v0, %v0, %v0, %v0, 15, 0 + vfnma %v0, %v0, %v0, %v31, 0, 0 + vfnma %v0, %v0, %v31, %v0, 0, 0 + vfnma %v0, %v31, %v0, %v0, 0, 0 + vfnma %v31, %v0, %v0, %v0, 0, 0 + vfnma %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmadb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9f] +#CHECK: vfnmadb %v0, %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9f] +#CHECK: vfnmadb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9f] +#CHECK: vfnmadb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9f] +#CHECK: vfnmadb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9f] +#CHECK: vfnmadb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9f] + + vfnmadb %v0, %v0, %v0, %v0 + vfnmadb %v0, %v0, %v0, %v31 + vfnmadb %v0, %v0, %v31, %v0 + vfnmadb %v0, %v31, %v0, %v0 + vfnmadb %v31, %v0, %v0, %v0 + vfnmadb %v13, %v17, %v21, %v25 + +#CHECK: vfnmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9f] +#CHECK: vfnmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9f] +#CHECK: vfnmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9f] +#CHECK: vfnmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9f] +#CHECK: vfnmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9f] +#CHECK: vfnmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9f] + + vfnmasb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v31 + vfnmasb %v0, %v0, %v31, %v0 + vfnmasb %v0, %v31, %v0, %v0 + vfnmasb %v31, %v0, %v0, %v0 + vfnmasb %v13, %v17, %v21, %v25 + +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9e] +#CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9e] +#CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9e] +#CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9e] +#CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9e] + + vfnms %v0, %v0, %v0, %v0, 0, 0 + vfnms %v0, %v0, %v0, %v0, 0, 15 + vfnms %v0, %v0, 
%v0, %v0, 15, 0 + vfnms %v0, %v0, %v0, %v31, 0, 0 + vfnms %v0, %v0, %v31, %v0, 0, 0 + vfnms %v0, %v31, %v0, %v0, 0, 0 + vfnms %v31, %v0, %v0, %v0, 0, 0 + vfnms %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9e] +#CHECK: vfnmsdb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9e] +#CHECK: vfnmsdb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9e] +#CHECK: vfnmsdb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9e] + + vfnmsdb %v0, %v0, %v0, %v0 + vfnmsdb %v0, %v0, %v0, %v31 + vfnmsdb %v0, %v0, %v31, %v0 + vfnmsdb %v0, %v31, %v0, %v0 + vfnmsdb %v31, %v0, %v0, %v0 + vfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: vfnmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9e] +#CHECK: vfnmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9e] +#CHECK: vfnmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9e] +#CHECK: vfnmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9e] +#CHECK: vfnmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9e] +#CHECK: vfnmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9e] + + vfnmssb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v31 + vfnmssb %v0, %v0, %v31, %v0 + vfnmssb %v0, %v31, %v0, %v0 + vfnmssb %v31, %v0, %v0, %v0 + vfnmssb %v13, %v17, %v21, %v25 + +#CHECK: vfssb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe2] +#CHECK: vfssb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe2] +#CHECK: vfssb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe2] +#CHECK: vfssb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe2] +#CHECK: vfssb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe2] + + vfssb %v0, %v0, %v0 + vfssb %v0, %v0, %v31 + vfssb %v0, %v31, %v0 + vfssb %v31, %v0, 
%v0 + vfssb %v18, %v3, %v20 + +#CHECK: vfsqsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xce] +#CHECK: vfsqsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xce] +#CHECK: vfsqsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xce] + + vfsqsb %v0, %v0 + vfsqsb %v0, %v15 + vfsqsb %v0, %v31 + vfsqsb %v15, %v0 + vfsqsb %v31, %v0 + vfsqsb %v14, %v17 + +#CHECK: vftcisb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf0,0x20,0x4a] +#CHECK: vftcisb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x4a] +#CHECK: vftcisb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x4a] +#CHECK: vftcisb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x80,0x24,0x4a] + + vftcisb %v0, %v0, 0 + vftcisb %v0, %v0, 4095 + vftcisb %v0, %v15, 0 + vftcisb %v0, %v31, 0 + vftcisb %v15, %v0, 0 + vftcisb %v31, %v0, 0 + vftcisb %v4, %v21, 0x678 + +#CHECK: vlip %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x49] +#CHECK: vlip %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x49] +#CHECK: vlip %v0, 65535, 0 # encoding: [0xe6,0x00,0xff,0xff,0x00,0x49] +#CHECK: vlip %v15, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x49] +#CHECK: vlip %v31, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x49] +#CHECK: vlip %v17, 4660, 7 # encoding: [0xe6,0x10,0x12,0x34,0x78,0x49] + + vlip %v0, 0, 0 + vlip %v0, 0, 15 + vlip %v0, 0xffff, 0 + vlip %v15, 0, 0 + vlip %v31, 0, 0 + vlip %v17, 0x1234, 7 + +#CHECK: vllezlf %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15) # encoding: 
[0xe7,0x00,0xf0,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x60,0x04] +#CHECK: vllezlf %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x68,0x04] +#CHECK: vllezlf %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x68,0x04] + + vllezlf %v0, 0 + vllezlf %v0, 4095 + vllezlf %v0, 0(%r15) + vllezlf %v0, 0(%r15,%r1) + vllezlf %v15, 0 + vllezlf %v31, 0 + vllezlf %v18, 0x567(%r3,%r4) + +#CHECK: vlrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x35] +#CHECK: vlrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x35] +#CHECK: vlrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x35] +#CHECK: vlrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x35] +#CHECK: vlrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x35] + + vlrl %v0, 0, 0 + vlrl %v0, 4095, 0 + vlrl %v0, 0(%r15), 0 + vlrl %v0, 0, 255 + vlrl %v15, 0, 0 + vlrl %v31, 0, 0 + vlrl %v18, 1383(%r4), 3 + +#CHECK: vlrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x37] +#CHECK: vlrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x37] +#CHECK: vlrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x37] + + vlrlr %v0, %r0, 0 + vlrlr %v0, %r0, 4095 + vlrlr %v0, %r0, 0(%r15) + vlrlr %v0, %r15, 0 + vlrlr %v15, %r0, 0 + vlrlr %v31, %r0, 0 + vlrlr %v18, %r3, 1383(%r4) + +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0xb8] +#CHECK: vmsl %v0, 
%v0, %v0, %v0, 0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xb8] +#CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb8] +#CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb8] +#CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 # encoding: [0xe7,0x23,0x40,0x40,0x5a,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 # encoding: [0xe7,0x23,0x4b,0x80,0x5a,0xb8] + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmsl %v0, %v0, %v0, %v0, 15, 0 + vmsl %v0, %v0, %v0, %v0, 0, 12 + vmsl %v0, %v0, %v0, %v15, 0, 0 + vmsl %v0, %v0, %v0, %v31, 0, 0 + vmsl %v0, %v0, %v15, %v0, 0, 0 + vmsl %v0, %v0, %v31, %v0, 0, 0 + vmsl %v0, %v15, %v0, %v0, 0, 0 + vmsl %v0, %v31, %v0, %v0, 0, 0 + vmsl %v15, %v0, %v0, %v0, 0, 0 + vmsl %v31, %v0, %v0, %v0, 0, 0 + vmsl %v18, %v3, %v20, %v5, 0, 4 + vmsl %v18, %v3, %v20, %v5, 11, 8 + +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x03,0xc0,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf0,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0xb8] +#CHECK: vmslg %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0xb8] +#CHECK: vmslg %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0xb8] 
+#CHECK: vmslg %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x43,0x40,0x5a,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x43,0x80,0x5a,0xb8] + + vmslg %v0, %v0, %v0, %v0, 0 + vmslg %v0, %v0, %v0, %v0, 12 + vmslg %v0, %v0, %v0, %v15, 0 + vmslg %v0, %v0, %v0, %v31, 0 + vmslg %v0, %v0, %v15, %v0, 0 + vmslg %v0, %v0, %v31, %v0, 0 + vmslg %v0, %v15, %v0, %v0, 0 + vmslg %v0, %v31, %v0, %v0, 0 + vmslg %v15, %v0, %v0, %v0, 0 + vmslg %v31, %v0, %v0, %v0, 0 + vmslg %v18, %v3, %v20, %v5, 4 + vmslg %v18, %v3, %v20, %v5, 8 + +#CHECK: vmp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x78] +#CHECK: vmp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x78] +#CHECK: vmp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x78] +#CHECK: vmp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x78] +#CHECK: vmp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x78] + + vmp %v0, %v0, %v0, 0, 0 + vmp %v0, %v0, %v0, 0, 15 + vmp %v0, %v0, %v0, 255, 0 + vmp %v0, %v0, %v31, 0, 0 + vmp %v0, %v31, %v0, 0, 0 + vmp %v31, %v0, %v0, 0, 0 + vmp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vmsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x79] +#CHECK: vmsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x79] +#CHECK: vmsp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x79] +#CHECK: vmsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x79] +#CHECK: vmsp %v13, %v17, %v21, 121, 11 # encoding: 
[0xe6,0xd1,0x50,0xb7,0x96,0x79] + + vmsp %v0, %v0, %v0, 0, 0 + vmsp %v0, %v0, %v0, 0, 15 + vmsp %v0, %v0, %v0, 255, 0 + vmsp %v0, %v0, %v31, 0, 0 + vmsp %v0, %v31, %v0, 0, 0 + vmsp %v31, %v0, %v0, 0, 0 + vmsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vnn %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6e] +#CHECK: vnn %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6e] +#CHECK: vnn %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6e] +#CHECK: vnn %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6e] +#CHECK: vnn %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6e] + + vnn %v0, %v0, %v0 + vnn %v0, %v0, %v31 + vnn %v0, %v31, %v0 + vnn %v31, %v0, %v0 + vnn %v18, %v3, %v20 + +#CHECK: vnx %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6c] +#CHECK: vnx %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6c] +#CHECK: vnx %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6c] +#CHECK: vnx %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6c] +#CHECK: vnx %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6c] + + vnx %v0, %v0, %v0 + vnx %v0, %v0, %v31 + vnx %v0, %v31, %v0 + vnx %v31, %v0, %v0 + vnx %v18, %v3, %v20 + +#CHECK: voc %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6f] +#CHECK: voc %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6f] +#CHECK: voc %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6f] +#CHECK: voc %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6f] +#CHECK: voc %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6f] + + voc %v0, %v0, %v0 + voc %v0, %v0, %v31 + voc %v0, %v31, %v0 + voc %v31, %v0, %v0 + voc %v18, %v3, %v20 + +#CHECK: vpkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x34] +#CHECK: vpkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x34] +#CHECK: vpkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x34] 
+#CHECK: vpkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x34] +#CHECK: vpkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x34] + + vpkz %v0, 0, 0 + vpkz %v0, 4095, 0 + vpkz %v0, 0(%r15), 0 + vpkz %v0, 0, 255 + vpkz %v15, 0, 0 + vpkz %v31, 0, 0 + vpkz %v18, 1383(%r4), 3 + +#CHECK: vpopctb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vpopctb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x50] +#CHECK: vpopctb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x50] + + vpopctb %v0, %v0 + vpopctb %v0, %v15 + vpopctb %v0, %v31 + vpopctb %v15, %v0 + vpopctb %v31, %v0 + vpopctb %v14, %v17 + +#CHECK: vpopctf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x50] +#CHECK: vpopctf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x50] +#CHECK: vpopctf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x50] + + vpopctf %v0, %v0 + vpopctf %v0, %v15 + vpopctf %v0, %v31 + vpopctf %v15, %v0 + vpopctf %v31, %v0 + vpopctf %v14, %v17 + +#CHECK: vpopctg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x50] +#CHECK: vpopctg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x50] +#CHECK: vpopctg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0x50] + + vpopctg %v0, %v0 + vpopctg %v0, %v15 + vpopctg %v0, %v31 + vpopctg %v15, %v0 + vpopctg %v31, %v0 + vpopctg %v14, %v17 + +#CHECK: vpopcth %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x50] +#CHECK: vpopcth %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x50] +#CHECK: vpopcth %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x50] + + vpopcth %v0, %v0 + vpopcth %v0, %v15 + vpopcth %v0, %v31 + vpopcth %v15, %v0 + vpopcth %v31, %v0 + vpopcth %v14, %v17 + +#CHECK: vpsop %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5b] +#CHECK: vpsop %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5b] +#CHECK: vpsop %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5b] +#CHECK: vpsop %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x5b] + + vpsop %v0, %v0, 0, 0, 0 + vpsop %v0, %v0, 0, 0, 15 + vpsop %v0, %v0, 0, 255, 0 + vpsop %v0, %v0, 255, 0, 0 + vpsop %v0, %v31, 0, 0, 0 + vpsop %v31, %v0, 0, 0, 0 + vpsop %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vrp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7b] +#CHECK: vrp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7b] +#CHECK: vrp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7b] +#CHECK: vrp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7b] +#CHECK: vrp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7b] + + vrp %v0, %v0, %v0, 0, 0 + vrp %v0, %v0, %v0, 0, 15 + vrp %v0, %v0, %v0, 255, 0 + vrp %v0, %v0, %v31, 0, 0 + vrp %v0, %v31, %v0, 0, 0 + vrp %v31, %v0, %v0, 0, 0 + vrp %v13, %v17, 
%v21, 0x79, 11 + +#CHECK: vsdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7e] +#CHECK: vsdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7e] +#CHECK: vsdp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7e] +#CHECK: vsdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7e] +#CHECK: vsdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7e] + + vsdp %v0, %v0, %v0, 0, 0 + vsdp %v0, %v0, %v0, 0, 15 + vsdp %v0, %v0, %v0, 255, 0 + vsdp %v0, %v0, %v31, 0, 0 + vsdp %v0, %v31, %v0, 0, 0 + vsdp %v31, %v0, %v0, 0, 0 + vsdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x73] +#CHECK: vsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x73] +#CHECK: vsp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x73] +#CHECK: vsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x73] +#CHECK: vsp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x73] + + vsp %v0, %v0, %v0, 0, 0 + vsp %v0, %v0, %v0, 0, 15 + vsp %v0, %v0, %v0, 255, 0 + vsp %v0, %v0, %v31, 0, 0 + vsp %v0, %v31, %v0, 0, 0 + vsp %v31, %v0, %v0, 0, 0 + vsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsrp %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x59] +#CHECK: vsrp %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x59] +#CHECK: vsrp %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x59] 
+#CHECK: vsrp %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x59] + + vsrp %v0, %v0, 0, 0, 0 + vsrp %v0, %v0, 0, 0, 15 + vsrp %v0, %v0, 0, 255, 0 + vsrp %v0, %v0, 255, 0, 0 + vsrp %v0, %v31, 0, 0, 0 + vsrp %v31, %v0, 0, 0, 0 + vsrp %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vstrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3d] +#CHECK: vstrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3d] +#CHECK: vstrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3d] +#CHECK: vstrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3d] + + vstrl %v0, 0, 0 + vstrl %v0, 4095, 0 + vstrl %v0, 0(%r15), 0 + vstrl %v0, 0, 255 + vstrl %v15, 0, 0 + vstrl %v31, 0, 0 + vstrl %v18, 1383(%r4), 3 + +#CHECK: vstrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3f] +#CHECK: vstrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3f] +#CHECK: vstrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3f] + + vstrlr %v0, %r0, 0 + vstrlr %v0, %r0, 4095 + vstrlr %v0, %r0, 0(%r15) + vstrlr %v0, %r15, 0 + vstrlr %v15, %r0, 0 + vstrlr %v31, %r0, 0 + vstrlr %v18, %r3, 1383(%r4) + +#CHECK: vtp %v0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v15 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v31 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5f] + + vtp %v0 + vtp %v15 + vtp %v31 + +#CHECK: vupkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3c] 
+#CHECK: vupkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3c] +#CHECK: vupkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3c] +#CHECK: vupkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3c] + + vupkz %v0, 0, 0 + vupkz %v0, 4095, 0 + vupkz %v0, 0(%r15), 0 + vupkz %v0, 0, 255 + vupkz %v15, 0, 0 + vupkz %v31, 0, 0 + vupkz %v18, 1383(%r4), 3 + +#CHECK: wfasb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe3] +#CHECK: wfasb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe3] +#CHECK: wfasb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe3] +#CHECK: wfasb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe3] + + wfasb %v0, %v0, %v0 + wfasb %f0, %f0, %f0 + wfasb %v0, %v0, %v31 + wfasb %v0, %v31, %v0 + wfasb %v31, %v0, %v0 + wfasb %v18, %v3, %v20 + +#CHECK: wfaxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe3] +#CHECK: wfaxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe3] +#CHECK: wfaxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe3] +#CHECK: wfaxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe3] +#CHECK: wfaxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe3] + + wfaxb %v0, %v0, %v0 + wfaxb %v0, %v0, %v31 + wfaxb %v0, %v31, %v0 + wfaxb %v31, %v0, %v0 + wfaxb %v18, %v3, %v20 + +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcb] +#CHECK: wfcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %v31, %f0 # encoding: 
[0xe7,0xf0,0x00,0x00,0x28,0xcb] +#CHECK: wfcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcb] + + wfcsb %v0, %v0 + wfcsb %f0, %f0 + wfcsb %v0, %v15 + wfcsb %v0, %v31 + wfcsb %v15, %v0 + wfcsb %v31, %v0 + wfcsb %v14, %v17 + +#CHECK: wfcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xcb] +#CHECK: wfcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xcb] +#CHECK: wfcxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xcb] + + wfcxb %v0, %v0 + wfcxb %v0, %v15 + wfcxb %v0, %v31 + wfcxb %v15, %v0 + wfcxb %v31, %v0 + wfcxb %v14, %v17 + +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe8] +#CHECK: wfcesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe8] +#CHECK: wfcesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe8] +#CHECK: wfcesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe8] + + wfcesb %v0, %v0, %v0 + wfcesb %f0, %f0, %f0 + wfcesb %v0, %v0, %v31 + wfcesb %v0, %v31, %v0 + wfcesb %v31, %v0, %v0 + wfcesb %v18, %v3, %v20 + +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xe8] +#CHECK: wfcesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xe8] +#CHECK: wfcesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xe8] +#CHECK: wfcesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xe8] + + wfcesbs %v0, %v0, %v0 + wfcesbs %f0, %f0, %f0 + wfcesbs %v0, %v0, %v31 + wfcesbs %v0, %v31, %v0 + wfcesbs %v31, %v0, %v0 + wfcesbs %v18, %v3, %v20 + +#CHECK: wfcexb %v0, %v0, %v0 # 
encoding: [0xe7,0x00,0x00,0x08,0x40,0xe8] +#CHECK: wfcexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe8] +#CHECK: wfcexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe8] +#CHECK: wfcexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe8] +#CHECK: wfcexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe8] + + wfcexb %v0, %v0, %v0 + wfcexb %v0, %v0, %v31 + wfcexb %v0, %v31, %v0 + wfcexb %v31, %v0, %v0 + wfcexb %v18, %v3, %v20 + +#CHECK: wfcexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xe8] +#CHECK: wfcexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xe8] +#CHECK: wfcexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xe8] +#CHECK: wfcexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xe8] +#CHECK: wfcexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xe8] + + wfcexbs %v0, %v0, %v0 + wfcexbs %v0, %v0, %v31 + wfcexbs %v0, %v31, %v0 + wfcexbs %v31, %v0, %v0 + wfcexbs %v18, %v3, %v20 + +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xeb] +#CHECK: wfchsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xeb] +#CHECK: wfchsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xeb] +#CHECK: wfchsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xeb] + + wfchsb %v0, %v0, %v0 + wfchsb %f0, %f0, %f0 + wfchsb %v0, %v0, %v31 + wfchsb %v0, %v31, %v0 + wfchsb %v31, %v0, %v0 + wfchsb %v18, %v3, %v20 + +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xeb] +#CHECK: wfchsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xeb] +#CHECK: wfchsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xeb] +#CHECK: wfchsbs %v18, %f3, %v20 # encoding: 
[0xe7,0x23,0x40,0x18,0x2a,0xeb] + + wfchsbs %v0, %v0, %v0 + wfchsbs %f0, %f0, %f0 + wfchsbs %v0, %v0, %v31 + wfchsbs %v0, %v31, %v0 + wfchsbs %v31, %v0, %v0 + wfchsbs %v18, %v3, %v20 + +#CHECK: wfchxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xeb] +#CHECK: wfchxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xeb] +#CHECK: wfchxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xeb] +#CHECK: wfchxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xeb] +#CHECK: wfchxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xeb] + + wfchxb %v0, %v0, %v0 + wfchxb %v0, %v0, %v31 + wfchxb %v0, %v31, %v0 + wfchxb %v31, %v0, %v0 + wfchxb %v18, %v3, %v20 + +#CHECK: wfchxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xeb] +#CHECK: wfchxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xeb] +#CHECK: wfchxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xeb] +#CHECK: wfchxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xeb] +#CHECK: wfchxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xeb] + + wfchxbs %v0, %v0, %v0 + wfchxbs %v0, %v0, %v31 + wfchxbs %v0, %v31, %v0 + wfchxbs %v31, %v0, %v0 + wfchxbs %v18, %v3, %v20 + +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xea] +#CHECK: wfchesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xea] +#CHECK: wfchesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xea] +#CHECK: wfchesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xea] + + wfchesb %v0, %v0, %v0 + wfchesb %f0, %f0, %f0 + wfchesb %v0, %v0, %v31 + wfchesb %v0, %v31, %v0 + wfchesb %v31, %v0, %v0 + wfchesb %v18, %v3, %v20 + +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %v31 # encoding: 
[0xe7,0x00,0xf0,0x18,0x22,0xea] +#CHECK: wfchesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xea] +#CHECK: wfchesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xea] +#CHECK: wfchesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xea] + + wfchesbs %v0, %v0, %v0 + wfchesbs %f0, %f0, %f0 + wfchesbs %v0, %v0, %v31 + wfchesbs %v0, %v31, %v0 + wfchesbs %v31, %v0, %v0 + wfchesbs %v18, %v3, %v20 + +#CHECK: wfchexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xea] +#CHECK: wfchexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xea] +#CHECK: wfchexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xea] +#CHECK: wfchexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xea] +#CHECK: wfchexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xea] + + wfchexb %v0, %v0, %v0 + wfchexb %v0, %v0, %v31 + wfchexb %v0, %v31, %v0 + wfchexb %v31, %v0, %v0 + wfchexb %v18, %v3, %v20 + +#CHECK: wfchexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xea] +#CHECK: wfchexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xea] +#CHECK: wfchexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xea] +#CHECK: wfchexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xea] +#CHECK: wfchexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xea] + + wfchexbs %v0, %v0, %v0 + wfchexbs %v0, %v0, %v31 + wfchexbs %v0, %v31, %v0 + wfchexbs %v31, %v0, %v0 + wfchexbs %v18, %v3, %v20 + +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe5] +#CHECK: wfdsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe5] +#CHECK: wfdsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe5] +#CHECK: wfdsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe5] + + wfdsb %v0, %v0, %v0 + wfdsb %f0, %f0, %f0 + wfdsb %v0, %v0, %v31 + wfdsb %v0, %v31, %v0 + wfdsb %v31, 
%v0, %v0 + wfdsb %v18, %v3, %v20 + +#CHECK: wfdxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe5] +#CHECK: wfdxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe5] +#CHECK: wfdxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe5] +#CHECK: wfdxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe5] +#CHECK: wfdxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe5] + + wfdxb %v0, %v0, %v0 + wfdxb %v0, %v0, %v31 + wfdxb %v0, %v31, %v0 + wfdxb %v31, %v0, %v0 + wfdxb %v18, %v3, %v20 + +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x20,0xc7] +#CHECK: wfisb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc7] +#CHECK: wfisb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc7] +#CHECK: wfisb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x24,0xc7] + + wfisb %v0, %v0, 0, 0 + wfisb %f0, %f0, 0, 0 + wfisb %v0, %v0, 0, 15 + wfisb %v0, %v0, 4, 0 + wfisb %v0, %v0, 7, 0 + wfisb %v0, %v31, 0, 0 + wfisb %v31, %v0, 0, 0 + wfisb %v14, %v17, 4, 10 + +#CHECK: wfixb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc7] +#CHECK: wfixb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc7] +#CHECK: wfixb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc7] +#CHECK: wfixb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc7] + + wfixb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 15 + wfixb %v0, %v0, 4, 0 + wfixb %v0, %v0, 7, 0 + wfixb %v0, %v31, 0, 0 + wfixb %v31, %v0, 0, 0 + wfixb %v14, 
%v17, 4, 10 + +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xca] +#CHECK: wfksb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xca] +#CHECK: wfksb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xca] +#CHECK: wfksb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xca] + + wfksb %v0, %v0 + wfksb %f0, %f0 + wfksb %v0, %v15 + wfksb %v0, %v31 + wfksb %v15, %v0 + wfksb %v31, %v0 + wfksb %v14, %v17 + +#CHECK: wfkxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xca] +#CHECK: wfkxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xca] +#CHECK: wfkxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xca] + + wfkxb %v0, %v0 + wfkxb %v0, %v15 + wfkxb %v0, %v31 + wfkxb %v15, %v0 + wfkxb %v31, %v0 + wfkxb %v14, %v17 + +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xe8] +#CHECK: wfkedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xe8] +#CHECK: wfkedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xe8] +#CHECK: wfkedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xe8] + + wfkedb %v0, %v0, %v0 + wfkedb %f0, %f0, %f0 + wfkedb %v0, %v0, %v31 + wfkedb %v0, %v31, %v0 + wfkedb %v31, %v0, %v0 + wfkedb %v18, %v3, %v20 + +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xe8] +#CHECK: wfkedbs %f0, 
%v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xe8] +#CHECK: wfkedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xe8] +#CHECK: wfkedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xe8] + + wfkedbs %v0, %v0, %v0 + wfkedbs %f0, %f0, %f0 + wfkedbs %v0, %v0, %v31 + wfkedbs %v0, %v31, %v0 + wfkedbs %v31, %v0, %v0 + wfkedbs %v18, %v3, %v20 + +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xe8] +#CHECK: wfkesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xe8] +#CHECK: wfkesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xe8] +#CHECK: wfkesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xe8] + + wfkesb %v0, %v0, %v0 + wfkesb %f0, %f0, %f0 + wfkesb %v0, %v0, %v31 + wfkesb %v0, %v31, %v0 + wfkesb %v31, %v0, %v0 + wfkesb %v18, %v3, %v20 + +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xe8] +#CHECK: wfkesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xe8] +#CHECK: wfkesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xe8] +#CHECK: wfkesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xe8] + + wfkesbs %v0, %v0, %v0 + wfkesbs %f0, %f0, %f0 + wfkesbs %v0, %v0, %v31 + wfkesbs %v0, %v31, %v0 + wfkesbs %v31, %v0, %v0 + wfkesbs %v18, %v3, %v20 + +#CHECK: wfkexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xe8] +#CHECK: wfkexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xe8] +#CHECK: wfkexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xe8] +#CHECK: wfkexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xe8] +#CHECK: wfkexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xe8] + + wfkexb %v0, %v0, %v0 + wfkexb %v0, %v0, %v31 + wfkexb 
%v0, %v31, %v0 + wfkexb %v31, %v0, %v0 + wfkexb %v18, %v3, %v20 + +#CHECK: wfkexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xe8] +#CHECK: wfkexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xe8] +#CHECK: wfkexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xe8] +#CHECK: wfkexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xe8] +#CHECK: wfkexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xe8] + + wfkexbs %v0, %v0, %v0 + wfkexbs %v0, %v0, %v31 + wfkexbs %v0, %v31, %v0 + wfkexbs %v31, %v0, %v0 + wfkexbs %v18, %v3, %v20 + +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xeb] +#CHECK: wfkhdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xeb] +#CHECK: wfkhdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xeb] +#CHECK: wfkhdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xeb] + + wfkhdb %v0, %v0, %v0 + wfkhdb %f0, %f0, %f0 + wfkhdb %v0, %v0, %v31 + wfkhdb %v0, %v31, %v0 + wfkhdb %v31, %v0, %v0 + wfkhdb %v18, %v3, %v20 + +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xeb] +#CHECK: wfkhdbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xeb] +#CHECK: wfkhdbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xeb] +#CHECK: wfkhdbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xeb] + + wfkhdbs %v0, %v0, %v0 + wfkhdbs %f0, %f0, %f0 + wfkhdbs %v0, %v0, %v31 + wfkhdbs %v0, %v31, %v0 + wfkhdbs %v31, %v0, %v0 + wfkhdbs %v18, %v3, %v20 + +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %v31 # encoding: 
[0xe7,0x00,0xf0,0x0c,0x22,0xeb] +#CHECK: wfkhsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xeb] +#CHECK: wfkhsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xeb] +#CHECK: wfkhsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xeb] + + wfkhsb %v0, %v0, %v0 + wfkhsb %f0, %f0, %f0 + wfkhsb %v0, %v0, %v31 + wfkhsb %v0, %v31, %v0 + wfkhsb %v31, %v0, %v0 + wfkhsb %v18, %v3, %v20 + +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xeb] +#CHECK: wfkhsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xeb] +#CHECK: wfkhsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xeb] +#CHECK: wfkhsbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xeb] + + wfkhsbs %v0, %v0, %v0 + wfkhsbs %f0, %f0, %f0 + wfkhsbs %v0, %v0, %v31 + wfkhsbs %v0, %v31, %v0 + wfkhsbs %v31, %v0, %v0 + wfkhsbs %v18, %v3, %v20 + +#CHECK: wfkhxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xeb] +#CHECK: wfkhxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xeb] +#CHECK: wfkhxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xeb] +#CHECK: wfkhxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xeb] +#CHECK: wfkhxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xeb] + + wfkhxb %v0, %v0, %v0 + wfkhxb %v0, %v0, %v31 + wfkhxb %v0, %v31, %v0 + wfkhxb %v31, %v0, %v0 + wfkhxb %v18, %v3, %v20 + +#CHECK: wfkhxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xeb] +#CHECK: wfkhxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xeb] +#CHECK: wfkhxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xeb] +#CHECK: wfkhxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xeb] +#CHECK: wfkhxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xeb] + + wfkhxbs %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v31 + wfkhxbs %v0, %v31, %v0 + wfkhxbs %v31, %v0, %v0 + 
wfkhxbs %v18, %v3, %v20 + +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xea] +#CHECK: wfkhedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xea] +#CHECK: wfkhedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xea] +#CHECK: wfkhedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xea] + + wfkhedb %v0, %v0, %v0 + wfkhedb %f0, %f0, %f0 + wfkhedb %v0, %v0, %v31 + wfkhedb %v0, %v31, %v0 + wfkhedb %v31, %v0, %v0 + wfkhedb %v18, %v3, %v20 + +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xea] +#CHECK: wfkhedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xea] +#CHECK: wfkhedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xea] +#CHECK: wfkhedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xea] + + wfkhedbs %v0, %v0, %v0 + wfkhedbs %f0, %f0, %f0 + wfkhedbs %v0, %v0, %v31 + wfkhedbs %v0, %v31, %v0 + wfkhedbs %v31, %v0, %v0 + wfkhedbs %v18, %v3, %v20 + +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xea] +#CHECK: wfkhesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xea] +#CHECK: wfkhesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xea] +#CHECK: wfkhesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xea] + + wfkhesb %v0, %v0, %v0 + wfkhesb %f0, %f0, %f0 + wfkhesb %v0, %v0, %v31 + wfkhesb %v0, %v31, %v0 + wfkhesb %v31, %v0, %v0 + wfkhesb %v18, %v3, %v20 + +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: 
[0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xea] +#CHECK: wfkhesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xea] +#CHECK: wfkhesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xea] +#CHECK: wfkhesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xea] + + wfkhesbs %v0, %v0, %v0 + wfkhesbs %f0, %f0, %f0 + wfkhesbs %v0, %v0, %v31 + wfkhesbs %v0, %v31, %v0 + wfkhesbs %v31, %v0, %v0 + wfkhesbs %v18, %v3, %v20 + +#CHECK: wfkhexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xea] +#CHECK: wfkhexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xea] +#CHECK: wfkhexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xea] +#CHECK: wfkhexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xea] +#CHECK: wfkhexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xea] + + wfkhexb %v0, %v0, %v0 + wfkhexb %v0, %v0, %v31 + wfkhexb %v0, %v31, %v0 + wfkhexb %v31, %v0, %v0 + wfkhexb %v18, %v3, %v20 + +#CHECK: wfkhexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xea] +#CHECK: wfkhexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xea] +#CHECK: wfkhexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xea] +#CHECK: wfkhexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xea] +#CHECK: wfkhexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xea] + + wfkhexbs %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v31 + wfkhexbs %v0, %v31, %v0 + wfkhexbs %v31, %v0, %v0 + wfkhexbs %v18, %v3, %v20 + +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xcc] +#CHECK: wfpsosb %f0, %f15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x24,0xcc] +#CHECK: wfpsosb %f15, %f0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %v31, %f0, 3 
# encoding: [0xe7,0xf0,0x00,0x38,0x28,0xcc] +#CHECK: wfpsosb %f14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x24,0xcc] + + wfpsosb %v0, %v0, 3 + wfpsosb %f0, %f0, 3 + wfpsosb %v0, %v0, 15 + wfpsosb %v0, %v15, 3 + wfpsosb %v0, %v31, 3 + wfpsosb %v15, %v0, 3 + wfpsosb %v31, %v0, 3 + wfpsosb %v14, %v17, 7 + +#CHECK: wfpsoxb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x44,0xcc] +#CHECK: wfpsoxb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x48,0xcc] +#CHECK: wfpsoxb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x44,0xcc] + + wfpsoxb %v0, %v0, 3 + wfpsoxb %v0, %v0, 15 + wfpsoxb %v0, %v15, 3 + wfpsoxb %v0, %v31, 3 + wfpsoxb %v15, %v0, 3 + wfpsoxb %v31, %v0, 3 + wfpsoxb %v14, %v17, 7 + +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xcc] +#CHECK: wflcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xcc] +#CHECK: wflcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xcc] + + wflcsb %v0, %v0 + wflcsb %f0, %f0 + wflcsb %v0, %v15 + wflcsb %v0, %v31 + wflcsb %v15, %v0 + wflcsb %v31, %v0 + wflcsb %v14, %v17 + +#CHECK: wflcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xcc] +#CHECK: wflcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xcc] +#CHECK: wflcxb %v14, %v17 # 
encoding: [0xe7,0xe1,0x00,0x08,0x44,0xcc] + + wflcxb %v0, %v0 + wflcxb %v0, %v15 + wflcxb %v0, %v31 + wflcxb %v15, %v0 + wflcxb %v31, %v0 + wflcxb %v14, %v17 + +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xcc] +#CHECK: wflnsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xcc] +#CHECK: wflnsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x24,0xcc] + + wflnsb %v0, %v0 + wflnsb %f0, %f0 + wflnsb %v0, %v15 + wflnsb %v0, %v31 + wflnsb %v15, %v0 + wflnsb %v31, %v0 + wflnsb %v14, %v17 + +#CHECK: wflnxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xcc] +#CHECK: wflnxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xcc] +#CHECK: wflnxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x44,0xcc] + + wflnxb %v0, %v0 + wflnxb %v0, %v15 + wflnxb %v0, %v31 + wflnxb %v15, %v0 + wflnxb %v31, %v0 + wflnxb %v14, %v17 + +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x24,0xcc] +#CHECK: wflpsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x28,0xcc] +#CHECK: wflpsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x24,0xcc] + + wflpsb %v0, %v0 + wflpsb %f0, %f0 + wflpsb %v0, %v15 + wflpsb %v0, %v31 + wflpsb %v15, %v0 + wflpsb %v31, %v0 + wflpsb %v14, %v17 + +#CHECK: wflpxb %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x44,0xcc] +#CHECK: wflpxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x48,0xcc] +#CHECK: wflpxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x44,0xcc] + + wflpxb %v0, %v0 + wflpxb %v0, %v15 + wflpxb %v0, %v31 + wflpxb %v15, %v0 + wflpxb %v31, %v0 + wflpxb %v14, %v17 + +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc4] +#CHECK: wflls %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xc4] +#CHECK: wflls %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc4] +#CHECK: wflls %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xc4] + + wflls %v0, %v0 + wflls %f0, %f0 + wflls %v0, %v15 + wflls %v0, %v31 + wflls %v15, %v0 + wflls %v31, %v0 + wflls %v14, %v17 + +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc4] +#CHECK: wflld %v15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc4] +#CHECK: wflld %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xc4] + + wflld %v0, %v0 + wflld %v0, %f0 + wflld %v0, %v15 + wflld %v0, %v31 + wflld %v15, %v0 + wflld %v31, %v0 + wflld %v14, %v17 + +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 4, 0 # encoding: 
[0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc5] +#CHECK: wflrd %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc5] +#CHECK: wflrd %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc5] + + wflrd %v0, %v0, 0, 0 + wflrd %f0, %f0, 0, 0 + wflrd %v0, %v0, 0, 15 + wflrd %v0, %v0, 4, 0 + wflrd %v0, %v0, 12, 0 + wflrd %v0, %v31, 0, 0 + wflrd %v31, %v0, 0, 0 + wflrd %v14, %v17, 4, 10 + +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc5] +#CHECK: wflrx %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc5] +#CHECK: wflrx %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc5] +#CHECK: wflrx %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc5] + + wflrx %v0, %v0, 0, 0 + wflrx %f0, %v0, 0, 0 + wflrx %v0, %v0, 0, 15 + wflrx %v0, %v0, 4, 0 + wflrx %v0, %v0, 7, 0 + wflrx %v0, %v31, 0, 0 + wflrx %v31, %v0, 0, 0 + wflrx %v14, %v17, 4, 10 + +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xef] +#CHECK: wfmaxdb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xef] +#CHECK: wfmaxdb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xef] +#CHECK: wfmaxdb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x3a,0xef] + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxdb %f0, %f0, %f0, 0 + wfmaxdb %v0, %v0, %v0, 4 + wfmaxdb %v0, %v0, %v31, 0 + wfmaxdb %v0, 
%v31, %v0, 0 + wfmaxdb %v31, %v0, %v0, 0 + wfmaxdb %v18, %v3, %v20, 11 + +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xef] +#CHECK: wfmaxsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xef] +#CHECK: wfmaxsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xef] +#CHECK: wfmaxsb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x2a,0xef] + + wfmaxsb %v0, %v0, %v0, 0 + wfmaxsb %f0, %f0, %f0, 0 + wfmaxsb %v0, %v0, %v0, 4 + wfmaxsb %v0, %v0, %v31, 0 + wfmaxsb %v0, %v31, %v0, 0 + wfmaxsb %v31, %v0, %v0, 0 + wfmaxsb %v18, %v3, %v20, 11 + +#CHECK: wfmaxxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xef] +#CHECK: wfmaxxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xef] +#CHECK: wfmaxxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xef] +#CHECK: wfmaxxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xef] + + wfmaxxb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 4 + wfmaxxb %v0, %v0, %v31, 0 + wfmaxxb %v0, %v31, %v0, 0 + wfmaxxb %v31, %v0, %v0, 0 + wfmaxxb %v18, %v3, %v20, 11 + +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xee] +#CHECK: wfmindb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xee] +#CHECK: wfmindb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xee] +#CHECK: wfmindb %v18, %f3, %v20, 11 # encoding: 
[0xe7,0x23,0x40,0xb8,0x3a,0xee] + + wfmindb %v0, %v0, %v0, 0 + wfmindb %f0, %f0, %f0, 0 + wfmindb %v0, %v0, %v0, 4 + wfmindb %v0, %v0, %v31, 0 + wfmindb %v0, %v31, %v0, 0 + wfmindb %v31, %v0, %v0, 0 + wfmindb %v18, %v3, %v20, 11 + +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xee] +#CHECK: wfminsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xee] +#CHECK: wfminsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xee] +#CHECK: wfminsb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x2a,0xee] + + wfminsb %v0, %v0, %v0, 0 + wfminsb %f0, %f0, %f0, 0 + wfminsb %v0, %v0, %v0, 4 + wfminsb %v0, %v0, %v31, 0 + wfminsb %v0, %v31, %v0, 0 + wfminsb %v31, %v0, %v0, 0 + wfminsb %v18, %v3, %v20, 11 + +#CHECK: wfminxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xee] +#CHECK: wfminxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xee] +#CHECK: wfminxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xee] +#CHECK: wfminxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xee] + + wfminxb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 4 + wfminxb %v0, %v0, %v31, 0 + wfminxb %v0, %v31, %v0, 0 + wfminxb %v31, %v0, %v0, 0 + wfminxb %v18, %v3, %v20, 11 + +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8f] +#CHECK: wfmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8f] +#CHECK: wfmasb %f0, %v31, %f0, %f0 # encoding: 
[0xe7,0x0f,0x02,0x08,0x04,0x8f] +#CHECK: wfmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8f] +#CHECK: wfmasb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x8f] + + wfmasb %v0, %v0, %v0, %v0 + wfmasb %f0, %f0, %f0, %f0 + wfmasb %v0, %v0, %v0, %v31 + wfmasb %v0, %v0, %v31, %v0 + wfmasb %v0, %v31, %v0, %v0 + wfmasb %v31, %v0, %v0, %v0 + wfmasb %v13, %v17, %v21, %v25 + +#CHECK: wfmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8f] +#CHECK: wfmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8f] +#CHECK: wfmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8f] +#CHECK: wfmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8f] +#CHECK: wfmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8f] +#CHECK: wfmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8f] + + wfmaxb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v31 + wfmaxb %v0, %v0, %v31, %v0 + wfmaxb %v0, %v31, %v0, %v0 + wfmaxb %v31, %v0, %v0, %v0 + wfmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe7] +#CHECK: wfmsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe7] +#CHECK: wfmsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe7] +#CHECK: wfmsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe7] + + wfmsb %v0, %v0, %v0 + wfmsb %f0, %f0, %f0 + wfmsb %v0, %v0, %v31 + wfmsb %v0, %v31, %v0 + wfmsb %v31, %v0, %v0 + wfmsb %v18, %v3, %v20 + +#CHECK: wfmxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe7] +#CHECK: wfmxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe7] +#CHECK: wfmxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe7] +#CHECK: wfmxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe7] +#CHECK: wfmxb %v18, %v3, %v20 # encoding: 
[0xe7,0x23,0x40,0x08,0x4a,0xe7] + + wfmxb %v0, %v0, %v0 + wfmxb %v0, %v0, %v31 + wfmxb %v0, %v31, %v0 + wfmxb %v31, %v0, %v0 + wfmxb %v18, %v3, %v20 + +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8e] +#CHECK: wfmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8e] +#CHECK: wfmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x8e] +#CHECK: wfmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8e] +#CHECK: wfmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x8e] + + wfmssb %v0, %v0, %v0, %v0 + wfmssb %f0, %f0, %f0, %f0 + wfmssb %v0, %v0, %v0, %v31 + wfmssb %v0, %v0, %v31, %v0 + wfmssb %v0, %v31, %v0, %v0 + wfmssb %v31, %v0, %v0, %v0 + wfmssb %v13, %v17, %v21, %v25 + +#CHECK: wfmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8e] +#CHECK: wfmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8e] +#CHECK: wfmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8e] +#CHECK: wfmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8e] +#CHECK: wfmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8e] +#CHECK: wfmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8e] + + wfmsxb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v31 + wfmsxb %v0, %v0, %v31, %v0 + wfmsxb %v0, %v31, %v0, %v0 + wfmsxb %v31, %v0, %v0, %v0 + wfmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9f] +#CHECK: wfnmadb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9f] +#CHECK: wfnmadb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9f] +#CHECK: 
wfnmadb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9f] +#CHECK: wfnmadb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9f] + + wfnmadb %v0, %v0, %v0, %v0 + wfnmadb %f0, %f0, %f0, %f0 + wfnmadb %v0, %v0, %v0, %v31 + wfnmadb %v0, %v0, %v31, %v0 + wfnmadb %v0, %v31, %v0, %v0 + wfnmadb %v31, %v0, %v0, %v0 + wfnmadb %v13, %v17, %v21, %v25 + +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9f] +#CHECK: wfnmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9f] +#CHECK: wfnmasb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9f] +#CHECK: wfnmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9f] +#CHECK: wfnmasb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9f] + + wfnmasb %v0, %v0, %v0, %v0 + wfnmasb %f0, %f0, %f0, %f0 + wfnmasb %v0, %v0, %v0, %v31 + wfnmasb %v0, %v0, %v31, %v0 + wfnmasb %v0, %v31, %v0, %v0 + wfnmasb %v31, %v0, %v0, %v0 + wfnmasb %v13, %v17, %v21, %v25 + +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9f] +#CHECK: wfnmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9f] +#CHECK: wfnmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9f] +#CHECK: wfnmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9f] + + wfnmaxb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v31 + wfnmaxb %v0, %v0, %v31, %v0 + wfnmaxb %v0, %v31, %v0, %v0 + wfnmaxb %v31, %v0, %v0, %v0 + wfnmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: 
wfnmsdb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9e] +#CHECK: wfnmsdb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9e] +#CHECK: wfnmsdb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9e] +#CHECK: wfnmsdb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9e] +#CHECK: wfnmsdb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9e] + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmsdb %f0, %f0, %f0, %f0 + wfnmsdb %v0, %v0, %v0, %v31 + wfnmsdb %v0, %v0, %v31, %v0 + wfnmsdb %v0, %v31, %v0, %v0 + wfnmsdb %v31, %v0, %v0, %v0 + wfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9e] +#CHECK: wfnmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9e] +#CHECK: wfnmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9e] +#CHECK: wfnmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9e] +#CHECK: wfnmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9e] + + wfnmssb %v0, %v0, %v0, %v0 + wfnmssb %f0, %f0, %f0, %f0 + wfnmssb %v0, %v0, %v0, %v31 + wfnmssb %v0, %v0, %v31, %v0 + wfnmssb %v0, %v31, %v0, %v0 + wfnmssb %v31, %v0, %v0, %v0 + wfnmssb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9e] +#CHECK: wfnmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9e] +#CHECK: wfnmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9e] +#CHECK: wfnmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9e] + + wfnmsxb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v31 + wfnmsxb %v0, %v0, %v31, %v0 + wfnmsxb %v0, %v31, 
%v0, %v0 + wfnmsxb %v31, %v0, %v0, %v0 + wfnmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe2] +#CHECK: wfssb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe2] +#CHECK: wfssb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe2] +#CHECK: wfssb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe2] + + wfssb %v0, %v0, %v0 + wfssb %f0, %f0, %f0 + wfssb %v0, %v0, %v31 + wfssb %v0, %v31, %v0 + wfssb %v31, %v0, %v0 + wfssb %v18, %v3, %v20 + +#CHECK: wfsxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe2] +#CHECK: wfsxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe2] +#CHECK: wfsxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe2] +#CHECK: wfsxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe2] +#CHECK: wfsxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe2] + + wfsxb %v0, %v0, %v0 + wfsxb %v0, %v0, %v31 + wfsxb %v0, %v31, %v0 + wfsxb %v31, %v0, %v0 + wfsxb %v18, %v3, %v20 + +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xce] +#CHECK: wfsqsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xce] +#CHECK: wfsqsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xce] + + wfsqsb %v0, %v0 + wfsqsb %f0, %f0 + wfsqsb %v0, %v15 + wfsqsb %v0, %v31 + wfsqsb %v15, %v0 + wfsqsb %v31, %v0 + wfsqsb %v14, %v17 + +#CHECK: wfsqxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xce] +#CHECK: wfsqxb 
%v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xce] +#CHECK: wfsqxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x44,0xce] + + wfsqxb %v0, %v0 + wfsqxb %v0, %v15 + wfsqxb %v0, %v31 + wfsqxb %v15, %v0 + wfsqxb %v31, %v0 + wfsqxb %v14, %v17 + +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x20,0x4a] +#CHECK: wftcisb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0x4a] +#CHECK: wftcisb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0x4a] +#CHECK: wftcisb %f4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x24,0x4a] + + wftcisb %v0, %v0, 0 + wftcisb %f0, %f0, 0 + wftcisb %v0, %v0, 4095 + wftcisb %v0, %v15, 0 + wftcisb %v0, %v31, 0 + wftcisb %v15, %v0, 0 + wftcisb %v31, %v0, 0 + wftcisb %v4, %v21, 0x678 + +#CHECK: wftcixb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x40,0x4a] +#CHECK: wftcixb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0x4a] +#CHECK: wftcixb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0x4a] +#CHECK: wftcixb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0x4a] +#CHECK: wftcixb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x44,0x4a] + + wftcixb %v0, %v0, 0 + wftcixb %v0, %v0, 4095 + wftcixb %v0, %v15, 0 + wftcixb %v0, %v31, 0 + wftcixb %v15, %v0, 0 + wftcixb %v31, %v0, 0 + wftcixb %v4, %v21, 0x678 + diff --git a/test/MC/SystemZ/invalid-instructions-spellcheck.s b/test/MC/SystemZ/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..e77b99d9a387 --- /dev/null +++ 
b/test/MC/SystemZ/invalid-instructions-spellcheck.s @@ -0,0 +1,66 @@ +# RUN: not llvm-mc -triple=systemz -mcpu=z13 -show-encoding < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=systemz -mcpu=zEC12 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ZEC12 + +# This tests the mnemonic spell checker. + +# First check what happens when an instruction is omitted: + + %r1, %r2, %r3 + +# CHECK: error: unexpected token at start of statement +# CHECK-NEXT: %r1, %r2, %r3 +# CHECK-NEXT: ^ + +# We don't want to see a suggestion here; the edit distance is too large to +# give sensible suggestions: + + aaaaaaaaaaaaaaa %r1, %r2, %r3 + +# CHECK: error: invalid instruction +# CHECK-NEXT: aaaaaaaaaaaaaaa %r1, %r2, %r3 +# CHECK-NEXT: ^ + +# Check that we get one suggestion: 'cpdt' is 1 edit away, i.e. an deletion. + + cpdtX %r1, 0(4, %r15), 0 + +#CHECK: error: invalid instruction, did you mean: cpdt +#CHECK-NEXT: cpdtX %r1, 0(4, %r15), 0 +#CHECK-NEXT: ^ + +# Check edit distance 1 and 2 + + ltTr %r1, %r2 + +# CHECK: error: invalid instruction, did you mean: lr, lt, ltdr, ltdtr, lter, ltgr, ltr, ltxr, ltxtr, tr, trtr? +# CHECK-NEXT: ltTr %r1, %r2 +# CHECK-NEXT: ^ + +# Check edit distance 1 and 2, just insertions: + + begin 0, 65292 + +# CHECK: error: invalid instruction, did you mean: tbegin, tbeginc? +# CHECK-NEXT: begin 0, 65292 +# CHECK-NEXT: ^ + +# Check an instruction that is 2 edits away, and also has a lot of candidates: + + adt %r1, 244(%r15) + +# CHECK: error: invalid instruction, did you mean: a, ad, adb, adr, adtr, adtra, d, lat, mad, qadtr? +# CHECK-NEXT: adt %r1, 244(%r15) +# CHECK-NEXT: ^ + +# Here it is checked that we don't suggest instructions that are not supported. +# For example, in pre-z13 mode we don't want to see suggestions for vector instructions. + + vlvggp %v1, %r2, %r3 + +# CHECK-ZEC12: error: invalid instruction +# CHECK-ZEC12: vlvggp +# CHECK-ZEC12: ^ + +# CHECK: error: invalid instruction, did you mean: vlvg, vlvgg, vlvgp? 
+# CHECK-NEXT: vlvggp %v1, %r2, %r3 +# CHECK-NEXT: ^ diff --git a/test/MC/X86/pr22028.s b/test/MC/X86/pr22028.s index 6ce7da07488b..d82b50f051a1 100644 --- a/test/MC/X86/pr22028.s +++ b/test/MC/X86/pr22028.s @@ -1,6 +1,6 @@ // RUN: llvm-mc -triple i386-unknown-unknown-code16 -show-encoding %s | FileCheck --check-prefix=CHECK16 %s -// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s -// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s +// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck %s .intel_syntax diff --git a/test/Object/no-section-table.test b/test/Object/no-section-table.test index bd60e681b71f..9ecde4f8c369 100644 --- a/test/Object/no-section-table.test +++ b/test/Object/no-section-table.test @@ -3,7 +3,7 @@ RUN: | FileCheck %s CHECK: DynamicSection [ (24 entries) CHECK: Tag Type Name/Value -CHECK: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) +CHECK: 0x0000000000000001 NEEDED Shared library: [libc.so.6] CHECK: 0x000000000000000C INIT 0x4B8 CHECK: 0x000000000000000D FINI 0x618 CHECK: 0x0000000000000019 INIT_ARRAY 0x2006C0 diff --git a/test/Object/readobj-shared-object.test b/test/Object/readobj-shared-object.test index 173581e60c39..59f5ff127cf3 100644 --- a/test/Object/readobj-shared-object.test +++ b/test/Object/readobj-shared-object.test @@ -302,9 +302,9 @@ ELF: ] ELF32: DynamicSection [ (9 entries) ELF32: Tag Type Name/Value -ELF32: 0x00000001 NEEDED SharedLibrary (libc.so.6) -ELF32: 0x00000001 NEEDED SharedLibrary (libm.so.6) -ELF32: 0x0000000E SONAME LibrarySoname (libfoo.so) +ELF32: 0x00000001 NEEDED Shared library: [libc.so.6] +ELF32: 0x00000001 NEEDED Shared library: [libm.so.6] +ELF32: 0x0000000E SONAME Library soname: [libfoo.so] ELF32: 0x00000004 HASH {{[0-9a-f]+}} ELF32: 0x00000005 STRTAB {{[0-9a-f]+}} ELF32: 0x00000006 SYMTAB {{[0-9a-f]+}} @@ 
-315,9 +315,9 @@ ELF32: ] ELF64: DynamicSection [ (9 entries) ELF64: Tag Type Name/Value -ELF64: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) -ELF64: 0x0000000000000001 NEEDED SharedLibrary (libm.so.6) -ELF64: 0x000000000000000E SONAME LibrarySoname (libfoo.so) +ELF64: 0x0000000000000001 NEEDED Shared library: [libc.so.6] +ELF64: 0x0000000000000001 NEEDED Shared library: [libm.so.6] +ELF64: 0x000000000000000E SONAME Library soname: [libfoo.so] ELF64: 0x0000000000000004 HASH {{[0-9a-f]+}} ELF64: 0x0000000000000005 STRTAB {{[0-9a-f]+}} ELF64: 0x0000000000000006 SYMTAB {{[0-9a-f]+}} diff --git a/test/ObjectYAML/CodeView/guid.yaml b/test/ObjectYAML/CodeView/guid.yaml new file mode 100644 index 000000000000..8d8d0142c5e3 --- /dev/null +++ b/test/ObjectYAML/CodeView/guid.yaml @@ -0,0 +1,59 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.debug$T' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + Types: + - Kind: LF_TYPESERVER2 + TypeServer2: + Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}' + Age: 24 + Name: 'C:\src\llvm-project\build\vc140.pdb' +symbols: + - Name: '.debug$T' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 64 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 +... 
+ +# CHECK: --- !COFF +# CHECK: header: +# CHECK: Machine: IMAGE_FILE_MACHINE_AMD64 +# CHECK: Characteristics: [ ] +# CHECK: sections: +# CHECK: - Name: '.debug$T' +# CHECK: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] +# CHECK: Alignment: 1 +# CHECK: Types: +# CHECK: - Kind: LF_TYPESERVER2 +# CHECK: TypeServer2: +# CHECK: Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}' +# CHECK: Age: 24 +# CHECK: Name: 'C:\src\llvm-project\build\vc140.pdb' +# CHECK: symbols: +# CHECK: - Name: '.debug$T' +# CHECK: Value: 0 +# CHECK: SectionNumber: 1 +# CHECK: SimpleType: IMAGE_SYM_TYPE_NULL +# CHECK: ComplexType: IMAGE_SYM_DTYPE_NULL +# CHECK: StorageClass: IMAGE_SYM_CLASS_STATIC +# CHECK: SectionDefinition: +# CHECK: Length: 64 +# CHECK: NumberOfRelocations: 0 +# CHECK: NumberOfLinenumbers: 0 +# CHECK: CheckSum: 0 +# CHECK: Number: 0 +# CHECK: ... diff --git a/test/Other/cgscc-libcall-update.ll b/test/Other/cgscc-libcall-update.ll new file mode 100644 index 000000000000..e0833ca09266 --- /dev/null +++ b/test/Other/cgscc-libcall-update.ll @@ -0,0 +1,61 @@ +; Make sure that the CGSCC pass manager can handle when instcombine simplifies +; one libcall into an unrelated libcall and update the call graph accordingly. +; +; Also check that it can handle inlining *removing* a libcall entirely. 
+; +; RUN: opt -passes='cgscc(inline,function(instcombine))' -S < %s | FileCheck %s + +define i8* @wibble(i8* %arg1, i8* %arg2) { +; CHECK-LABEL: define i8* @wibble( +bb: + %tmp = alloca [1024 x i8], align 16 + %tmp2 = getelementptr inbounds [1024 x i8], [1024 x i8]* %tmp, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* %arg1, i64 1024, i32 0, i1 false) +; CHECK: call void @llvm.memcpy + %tmp3 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 false, i1 true) + %tmp4 = call i8* @__strncpy_chk(i8* %arg2, i8* %tmp2, i64 1023, i64 %tmp3) +; CHECK-NOT: call +; CHECK: call i8* @strncpy(i8* %arg2, i8* %tmp2, i64 1023) +; CHECK-NOT: call + + ret i8* %tmp4 +; CHECK: ret +} + +define i8* @strncpy(i8* %arg1, i8* %arg2, i64 %size) noinline { +bb: + %result = call i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size) + ret i8* %result +} + +declare i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size) + +declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1) + +declare i8* @__strncpy_chk(i8*, i8*, i64, i64) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +; Check that even when we completely remove a libcall we don't get the call +; graph wrong once we handle libcalls in the call graph specially to address +; the above case. +define i32 @hoge(i32* %arg1) { +; CHECK-LABEL: define i32 @hoge( +bb: + %tmp41 = load i32*, i32** null + %tmp6 = load i32, i32* %arg1 + %tmp7 = call i32 @ntohl(i32 %tmp6) +; CHECK-NOT: call i32 @ntohl + ret i32 %tmp7 +; CHECK: ret i32 +} + +; Even though this function is not used, it should be retained as it may be +; used when doing further libcall transformations. 
+define internal i32 @ntohl(i32 %x) { +; CHECK-LABEL: define internal i32 @ntohl( +entry: + %and2 = lshr i32 %x, 8 + %shr = and i32 %and2, 65280 + ret i32 %shr +} diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll index bf8e596d118b..35f596e77988 100644 --- a/test/Other/new-pass-manager.ll +++ b/test/Other/new-pass-manager.ll @@ -23,6 +23,7 @@ ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-CGSCC-PASS-NEXT: Running pass: NoOpCGSCCPass @@ -407,6 +408,7 @@ ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-REPEAT-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running pass: RepeatedPass diff --git a/test/ThinLTO/X86/debuginfo-cu-import.ll b/test/ThinLTO/X86/debuginfo-cu-import.ll index 42a751191860..e0b066c736e4 100644 --- a/test/ThinLTO/X86/debuginfo-cu-import.ll +++ b/test/ThinLTO/X86/debuginfo-cu-import.ll @@ -51,11 +51,11 @@ entry: !9 = !DIGlobalVariableExpression(var: !10) !10 = !DIGlobalVariable(name: 
"version", scope: !4, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true) !11 = !{!12, !16} -!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, line: 8) +!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, file: !1, line: 8) !13 = distinct !DISubprogram(name: "a", linkageName: "_ZN1A1aEv", scope: !4, file: !1, line: 7, type: !14, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !14 = !DISubroutineType(types: !15) !15 = !{null} -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !4, file: !1, line: 9, type: !14, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 4b9e7c3956f5..1dfc08761965 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -23,9 +23,63 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp3( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 
@llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X32-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X32-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp3( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, 
i8* [[X]], i8 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call @@ -50,27 +104,225 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp5( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X32-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] 
= sub i32 [[TMP13]], [[TMP14]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp5( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp6( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp6( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: 
[[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp7( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X32-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; 
X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp7( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP22:%.*]] 
= getelementptr i8, i8* [[X]], i8 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = zext i8 [[TMP24]] to i32 +; X64-NEXT: [[TMP27:%.*]] = zext i8 [[TMP25]] to i32 +; X64-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP28]], [[LOADBB2]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call @@ -78,8 +330,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: ret i32 [[CALL]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* 
[[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* @@ -99,72 +378,691 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = 
bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP18]] +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = zext i8 [[TMP20]] to i32 +; X32-NEXT: [[TMP23:%.*]] = zext i8 [[TMP21]] to i32 +; X32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP23]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP24]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; 
X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) ret i32 %call } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: 
[[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; 
X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) ret i32 %call } define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ 
[[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = zext 
i8 [[TMP31]] to i32 +; X32-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; X32-NEXT: [[TMP35:%.*]] = sub i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP35]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: 
+; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) ret i32 %call } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: 
[[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ 
[[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) ret i32 %call } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ 
[[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP27]] +; X32-NEXT: [[TMP30:%.*]] = load i8, i8* [[TMP28]] +; X32-NEXT: [[TMP31:%.*]] = zext i8 [[TMP29]] to i32 +; X32-NEXT: [[TMP32:%.*]] = zext i8 [[TMP30]] to i32 +; 
X32-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP33]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: 
[[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) ret i32 %call } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP35:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP36:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = 
getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i16, i16* [[TMP27]], i16 6 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i16, i16* [[TMP28]], i16 6 +; X32-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP31]]) +; X32-NEXT: [[TMP34:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP32]]) +; X32-NEXT: [[TMP35]] = zext i16 [[TMP33]] to i32 +; X32-NEXT: [[TMP36]] = zext i16 [[TMP34]] to i32 +; X32-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP35]], [[TMP36]] +; X32-NEXT: br i1 [[TMP37]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] 
+; +; X64-LABEL: @cmp14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 
+; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) ret i32 %call } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; 
X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X64: loadbb3: +; X64-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP31]] +; X64-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP32]] +; X64-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; X64-NEXT: [[TMP36:%.*]] = zext i8 [[TMP34]] to i32 +; X64-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP36]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: 
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP37]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) ret i32 %call } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP33:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP34:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = 
icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP27]], i32 3 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP28]], i32 3 +; X32-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP30]] +; X32-NEXT: [[TMP33]] = call i32 @llvm.bswap.i32(i32 [[TMP31]]) +; X32-NEXT: [[TMP34]] = call i32 @llvm.bswap.i32(i32 [[TMP32]]) +; X32-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br i1 [[TMP35]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 
[[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) ret i32 %call @@ -190,8 +1088,25 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; 
ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -221,8 +1136,25 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; 
ALL-NEXT: ret i32 [[CONV]] ; @@ -234,8 +1166,27 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -247,8 +1198,34 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* 
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; ALL: loadbb2: +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; ALL-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; ALL-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -260,8 +1237,27 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: 
[[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -283,11 +1279,60 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: 
[[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X32-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 
@memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -296,11 +1341,64 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = 
phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -309,11 +1407,78 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; 
X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: 
@cmp_eq11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -322,11 +1487,64 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: 
[[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: 
[[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0 @@ -335,11 +1553,78 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* 
+; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label 
[[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -348,11 +1633,82 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label 
[[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 6 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i16 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -361,11 +1717,52 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: 
[[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X64: loadbb3: +; X64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X64-NEXT: 
[[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X64-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0 @@ -374,11 +1771,73 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; 
X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 3 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[TMP20]], i32 3 +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 +; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], 
[[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) %cmp = icmp eq i32 %call, 0 diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll index b6b775797826..088b177c2e11 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -194,7 +194,6 @@ rare.2: br label %fallthrough } - declare void @slowpath(i32, i32*) ; Make sure we don't end up in an infinite loop after we fail to sink. @@ -218,3 +217,37 @@ load.i145: pl_loop.i.i122: br label %pl_loop.i.i122 } + +; Make sure we can sink address computation even +; if there is a cycle in phi nodes. +define void @test9(i1 %cond, i64* %base) { +; CHECK-LABEL: @test9 +entry: + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br label %header + +header: + %iv = phi i32 [0, %entry], [%iv.inc, %backedge] + %casted.loop = phi i32* [%casted, %entry], [%casted.merged, %backedge] + br i1 %cond, label %if.then, label %backedge + +if.then: + call void @foo(i32 %iv) + %addr.1 = getelementptr inbounds i64, i64* %base, i64 5 + %casted.1 = bitcast i64* %addr.1 to i32* + br label %backedge + +backedge: +; CHECK-LABEL: backedge: +; CHECK: getelementptr i8, {{.+}} 40 + %casted.merged = phi i32* [%casted.loop, %header], [%casted.1, %if.then] + %v = load i32, i32* %casted.merged, align 4 + call void @foo(i32 %v) + %iv.inc = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.inc, 1000 + br i1 %cmp, label %header, label %exit + +exit: + ret void +} diff --git a/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll new file mode 
100644 index 000000000000..57dbdd883190 --- /dev/null +++ b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -globals-aa -early-cse-memssa | FileCheck %s + +define i16 @f1() readonly { + ret i16 0 +} + +declare void @f2() + +; Check that EarlyCSE correctly handles function calls that don't have +; a MemoryAccess. In this case the calls to @f1 have no +; MemoryAccesses since globals-aa determines that @f1 doesn't +; read/write memory at all. + +define void @f3() { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[CALL1:%.*]] = call i16 @f1() +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; + %call1 = call i16 @f1() + call void @f2() + %call2 = call i16 @f1() + ret void +} diff --git a/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll new file mode 100644 index 000000000000..513379d0bd01 --- /dev/null +++ b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll @@ -0,0 +1,79 @@ +; This test checks if debug loc is propagated to load/store created by GVN/Instcombine. +; RUN: opt < %s -gvn -S | FileCheck %s --check-prefixes=ALL,GVN +; RUN: opt < %s -gvn -instcombine -S | FileCheck %s --check-prefixes=ALL,INSTCOMBINE + +; struct node { +; int *v; +; struct desc *descs; +; }; + +; struct desc { +; struct node *node; +; }; + +; extern int bar(void *v, void* n); + +; int test(struct desc *desc) +; { +; void *v, *n; +; v = !desc ? 
((void *)0) : desc->node->v; // Line 15 +; n = &desc->node->descs[0]; // Line 16 +; return bar(v, n); +; } + +; Line 16, Column 13: +; n = &desc->node->descs[0]; +; ^ + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.desc = type { %struct.node* } +%struct.node = type { i32*, %struct.desc* } + +define i32 @test(%struct.desc* readonly %desc) local_unnamed_addr #0 !dbg !4 { +entry: + %tobool = icmp eq %struct.desc* %desc, null + br i1 %tobool, label %cond.end, label %cond.false, !dbg !9 +; ALL: br i1 %tobool, label %entry.cond.end_crit_edge, label %cond.false, !dbg [[LOC_15_6:![0-9]+]] +; ALL: entry.cond.end_crit_edge: +; GVN: %.pre = load %struct.node*, %struct.node** null, align 8, !dbg [[LOC_16_13:![0-9]+]] +; INSTCOMBINE:store %struct.node* undef, %struct.node** null, align 536870912, !dbg [[LOC_16_13:![0-9]+]] + +cond.false: + %0 = bitcast %struct.desc* %desc to i8***, !dbg !11 + %1 = load i8**, i8*** %0, align 8, !dbg !11 + %2 = load i8*, i8** %1, align 8 + br label %cond.end, !dbg !9 + +cond.end: + %3 = phi i8* [ %2, %cond.false ], [ null, %entry ], !dbg !9 + %node2 = getelementptr inbounds %struct.desc, %struct.desc* %desc, i64 0, i32 0 + %4 = load %struct.node*, %struct.node** %node2, align 8, !dbg !10 + %descs = getelementptr inbounds %struct.node, %struct.node* %4, i64 0, i32 1 + %5 = bitcast %struct.desc** %descs to i8** + %6 = load i8*, i8** %5, align 8 + %call = tail call i32 @bar(i8* %3, i8* %6) + ret i32 %call +} + +declare i32 @bar(i8*, i8*) local_unnamed_addr #1 +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.c", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !5, isLocal: false, isDefinition: true, scopeLine: 13, flags: 
DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7} +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !{} +!9 = !DILocation(line: 15, column: 6, scope: !4) +!10 = !DILocation(line: 16, column: 13, scope: !4) +!11 = !DILocation(line: 15, column: 34, scope: !4) + +;ALL: [[SCOPE:![0-9]+]] = distinct !DISubprogram(name: "test",{{.*}} +;ALL: [[LOC_15_6]] = !DILocation(line: 15, column: 6, scope: [[SCOPE]]) +;ALL: [[LOC_16_13]] = !DILocation(line: 16, column: 13, scope: [[SCOPE]]) diff --git a/test/Transforms/GVN/PRE/phi-translate.ll b/test/Transforms/GVN/PRE/phi-translate.ll index 1f6c7c8d33ea..55f5fd6465b6 100644 --- a/test/Transforms/GVN/PRE/phi-translate.ll +++ b/test/Transforms/GVN/PRE/phi-translate.ll @@ -6,12 +6,12 @@ target datalayout = "e-p:64:64:64" ; CHECK: entry.end_crit_edge: ; CHECK: %[[INDEX:[a-z0-9.]+]] = sext i32 %x to i64{{$}} ; CHECK: %[[ADDRESS:[a-z0-9.]+]] = getelementptr [100 x i32], [100 x i32]* @G, i64 0, i64 %[[INDEX]]{{$}} -; CHECK: %n.pre = load i32, i32* %[[ADDRESS]]{{$}} +; CHECK: %n.pre = load i32, i32* %[[ADDRESS]], !dbg [[N_LOC:![0-9]+]] ; CHECK: br label %end ; CHECK: then: ; CHECK: store i32 %z ; CHECK: end: -; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC:![0-9]+]] +; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC]] ; CHECK: ret i32 %n ; CHECK: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) diff --git a/test/Transforms/GlobalOpt/pr33686.ll b/test/Transforms/GlobalOpt/pr33686.ll new file mode 100644 index 000000000000..d6bb98735f4e --- /dev/null +++ b/test/Transforms/GlobalOpt/pr33686.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -globalopt %s | FileCheck %s + +@glob = external global i16, align 1 + +define void @beth() { +; CHECK-LABEL: @beth( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void + 
+notreachable: + %patatino = select i1 undef, i16* @glob, i16* %patatino + br label %notreachable +} diff --git a/test/Transforms/IRCE/eq_ne.ll b/test/Transforms/IRCE/eq_ne.ll new file mode 100644 index 000000000000..1b1ffe6b94ba --- /dev/null +++ b/test/Transforms/IRCE/eq_ne.ll @@ -0,0 +1,257 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_02: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_03: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_04: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_05: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_06: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_07: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK-NOT: irce: in function test_08: constrained Loop at depth 1 containing: %loop
,%in.bounds + +; Show that IRCE can turn 'ne' condition to 'slt' in increasing IV. +define void @test_01(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_01 +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_02(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_02( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, -100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'sge' in increasing IV. 
+define void @test_03(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_03( +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_04(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_04( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, -100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'ne' condition to 'sgt' in decreasing IV. 
+define void @test_05(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_05( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 0 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'ne' condition to 'sgt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_06(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_06( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 120 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'slt' in decreasing IV. 
+define void @test_07(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_07( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 0 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'eq' condition to 'slt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_08(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_08( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 120 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} +!1 = !{!"branch_weights", i32 64, i32 4} diff --git a/test/Transforms/IRCE/pre_post_loops.ll b/test/Transforms/IRCE/pre_post_loops.ll new file mode 100644 index 000000000000..2cd2e29104fe --- /dev/null +++ b/test/Transforms/IRCE/pre_post_loops.ll @@ -0,0 +1,117 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
,%in.bounds +; CHECK: irce: in function test_02: constrained Loop at depth 1 containing: %loop
,%in.bounds + +; Iterate from 0 to SINT_MAX, check that the post-loop is generated. +define void @test_01(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_01( +; CHECK: entry: +; CHECK-NEXT: %exit.mainloop.at = load i32, i32* %a_len_ptr +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.next, %in.bounds ], [ 0, %loop.preheader ] +; CHECK-NEXT: %idx.next = add i32 %idx, 1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %exit.mainloop.at +; CHECK-NEXT: br i1 true, label %in.bounds, +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp slt i32 %idx.next, 2147483647 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 %idx.next, %exit.mainloop.at +; CHECK-NEXT: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: main.pseudo.exit: +; CHECK-NEXT: %idx.copy = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: %indvar.end = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: br label %postloop +; CHECK: postloop: +; CHECK-NEXT: br label %loop.postloop +; CHECK: loop.postloop: +; CHECK-NEXT: %idx.postloop = phi i32 [ %idx.copy, %postloop ], [ %idx.next.postloop, %in.bounds.postloop ] +; CHECK-NEXT: %idx.next.postloop = add i32 %idx.postloop, 1 +; CHECK-NEXT: %abc.postloop = icmp slt i32 %idx.postloop, %exit.mainloop.at +; CHECK-NEXT: br i1 %abc.postloop, label %in.bounds.postloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.postloop: +; CHECK-NEXT: %addr.postloop = getelementptr i32, i32* %arr, i32 %idx.postloop +; CHECK-NEXT: store i32 0, i32* %addr.postloop +; CHECK-NEXT: %next.postloop = icmp slt i32 %idx.next.postloop, 2147483647 +; CHECK-NEXT: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 
%abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp slt i32 %idx.next, 2147483647 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Iterate from SINT_MAX to 0, check that the pre-loop is generated. +define void @test_02(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_02( +; CHECK: entry: +; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +; CHECH-NEXT: br i1 true, label %loop.preloop.preheader +; CHECK: mainloop: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.preloop.copy, %mainloop ], [ %idx.next, %in.bounds ] +; CHECK-NEXT: %idx.next = add i32 %idx, -1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %len +; CHECK-NEXT: br i1 true, label %in.bounds +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp sgt i32 %idx.next, -1 +; CHECK-NEXT: br i1 %next, label %loop, label %exit.loopexit +; CHECK: loop.preloop: +; CHECK-NEXT: %idx.preloop = phi i32 [ %idx.next.preloop, %in.bounds.preloop ], [ 2147483647, %loop.preloop.preheader ] +; CHECK-NEXT: %idx.next.preloop = add i32 %idx.preloop, -1 +; CHECK-NEXT: %abc.preloop = icmp slt i32 %idx.preloop, %len +; CHECK-NEXT: br i1 %abc.preloop, label %in.bounds.preloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.preloop: +; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +; CHECK-NEXT: store i32 0, i32* %addr.preloop +; CHECK-NEXT: %next.preloop = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: br i1 [[COND]], label %loop.preloop, label %preloop.exit.selector + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 2147483647, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt 
i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp sgt i32 %idx.next, -1 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} diff --git a/test/Transforms/Inline/AArch64/ext.ll b/test/Transforms/Inline/AArch64/ext.ll new file mode 100644 index 000000000000..04095c04ee86 --- /dev/null +++ b/test/Transforms/Inline/AArch64/ext.ll @@ -0,0 +1,249 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; sext can be folded into gep. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = sext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer2(i32* %ptr, i32 %i) { + %C = call i32 @inner2(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner2(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer3(i32* %ptr, i16 %i) { + %C = call i32 @inner3(i32* %ptr, i16 %i) + ret i32 %C +} + +; zext can be folded into gep. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner3(i32* %ptr, i16 %i) { + %E = zext i16 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer4(i8* %ptr) { + %C = call i16 @inner4(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer5(i8* %ptr) { + %C = call i16 @inner5(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer6(i8* %ptr) { + %C = call i32 @inner6(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer7(i8* %ptr) { + %C = call i32 @inner7(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer8(i16* %ptr) { + %C = call i32 @inner8(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner8(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer9(i16* %ptr) { + %C = call i32 @inner9(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner9(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer10(i8* %ptr) { + %C = call i64 @inner10(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer11(i8* %ptr) { + %C = call i64 @inner11(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer12(i16* %ptr) { + %C = call i64 @inner12(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer13(i16* %ptr) { + %C = call i64 @inner13(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer14(i32* %ptr) { + %C = call i64 @inner14(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner14 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner14(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer15(i32* %ptr) { + %C = call i64 @inner15(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner15 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner15(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} + +define i64 @outer16(i32 %V1, i64 %V2) { + %C = call i64 @inner16(i32 %V1, i64 %V2) + ret i64 %C +} + +; sext can be folded into shl. +; CHECK: Analyzing call of inner16 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 4 +define i64 @inner16(i32 %V1, i64 %V2) { + %E = sext i32 %V1 to i64 + %S = shl i64 %E, 3 + %A = add i64 %V2, %S + ret i64 %A +} diff --git a/test/Transforms/Inline/PowerPC/ext.ll b/test/Transforms/Inline/PowerPC/ext.ll new file mode 100644 index 000000000000..f7a409467b2c --- /dev/null +++ b/test/Transforms/Inline/PowerPC/ext.ll @@ -0,0 +1,140 @@ +; REQUIRES: asserts +; RUN: opt -inline -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64le-ibm-linux-gnu" + +define i16 @outer1(i8* %ptr) { + %C = call i16 @inner1(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner1(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i32 @outer2(i8* %ptr) { + %C = call i32 @inner2(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer3(i16* %ptr) { + %C = call i32 @inner3(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner3(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer4(i16* %ptr) { + %C = call i32 @inner4(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer5(i8* %ptr) { + %C = call i64 @inner5(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer6(i16* %ptr) { + %C = call i64 @inner6(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer7(i16* %ptr) { + %C = call i64 @inner7(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer8(i32* %ptr) { + %C = call i64 @inner8(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer9(i32* %ptr) { + %C = call i64 @inner9(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/Inline/PowerPC/lit.local.cfg b/test/Transforms/Inline/PowerPC/lit.local.cfg new file mode 100644 index 000000000000..5d33887ff0a4 --- /dev/null +++ b/test/Transforms/Inline/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/Inline/X86/ext.ll b/test/Transforms/Inline/X86/ext.ll new file mode 100644 index 000000000000..bffda3852799 --- /dev/null +++ b/test/Transforms/Inline/X86/ext.ll @@ -0,0 +1,201 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=x86_64-unknown-unknown -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer2(i8* %ptr) { + %C = call i16 @inner2(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer3(i8* %ptr) { + %C = call i16 @inner3(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner3(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer4(i8* %ptr) { + %C = call i32 @inner4(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer5(i8* %ptr) { + %C = call i32 @inner5(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer6(i16* %ptr) { + %C = call i32 @inner6(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer7(i16* %ptr) { + %C = call i32 @inner7(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer8(i8* %ptr) { + %C = call i64 @inner8(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer9(i8* %ptr) { + %C = call i64 @inner9(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer10(i16* %ptr) { + %C = call i64 @inner10(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer11(i16* %ptr) { + %C = call i64 @inner11(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer12(i32* %ptr) { + %C = call i64 @inner12(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer13(i32* %ptr) { + %C = call i64 @inner13(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll index 3c4e08b5b515..905357817509 100644 --- a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll +++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll @@ -1,7 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK: llvm.umul.with.overflow define i32 @sterix(i32, i8, i64) { +; CHECK-LABEL: @sterix( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[TMP1:%.*]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV1]], 1945964878 +; CHECK-NEXT: [[SH_PROM:%.*]] = trunc i64 [[TMP2:%.*]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]] +; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[SHR]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[CONV6:%.*]] = and i64 [[MUL3]], 4294967295 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[CONV6]], [[MUL3]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] +; CHECK: lor.rhs: +; CHECK-NEXT: [[AND:%.*]] = and i64 [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[AND]] to i32 +; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp eq i32 [[CONV4]], 0 +; CHECK-NEXT: [[PHITMP:%.*]] = zext i1 [[TOBOOL7]] to i32 +; CHECK-NEXT: br label [[LOR_END]] +; CHECK: lor.end: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHITMP]], [[LOR_RHS]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; entry: %conv = zext i32 %0 to i64 %conv1 = sext i8 %1 to i32 diff --git a/test/Transforms/InstCombine/and-not-or.ll b/test/Transforms/InstCombine/and-not-or.ll deleted file mode 100644 index 
a42140be2805..000000000000 --- a/test/Transforms/InstCombine/and-not-or.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep "and i32 %x, %y" | count 4 -; RUN: opt < %s -instcombine -S | not grep "or" - -define i32 @func1(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func2(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func3(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %y, %o - ret i32 %a -} - -define i32 @func4(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %y, %o - ret i32 %a -} diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index 7bb9b95b3179..c12662d4db0e 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -628,3 +628,195 @@ define i32 @test43(i32 %a, i32 %c, i32 %d) { %and = and i32 %or, %xor ret i32 %and } + +; (~y | x) & y -> x & y +define i32 @test44(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test44( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %o, %y + ret i32 %a +} + +; (x | ~y) & y -> x & y +define i32 @test45(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test45( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %o, %y + ret i32 %a +} + +; y & (~y | x) -> y | x +define i32 @test46(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test46( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %y, %o + ret i32 %a +} + +; y & (x | ~y) -> y | x +define i32 @test47(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test47( +; 
CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %y, %o + ret i32 %a +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (X & (Y | ~X)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @and_orn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x_inv + %and = and i1 %x, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | ~X) & X) -> (X & Y), where 'not' is an inverted cmp + +define <2 x i1> @and_orn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @and_orn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[AND]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, ; thwart complexity-based ordering + %or = or <2 x i1> %y, %x_inv + %and = and <2 x i1> %or, %x + ret <2 x i1> %and +} + +; Commute the 'or': +; (X & (~X | Y)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @and_orn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x_inv, %y + %and = and i1 %x, %or + ret i1 %and +} + +; Commute the 'and': +; ((~X | Y) & X) 
-> (X & Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[AND]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, ; thwart complexity-based ordering + %or = or <3 x i1> %x_inv, %y + %and = and <3 x i1> %or, %x + ret <3 x i1> %and +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (~X & (Y | X)) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @andn_or_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %x_inv, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | X) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @andn_or_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %or, %x_inv + ret i1 %and +} + +; Commute the 'or': +; (~X & (X | Y)) -> (~X & Y), where 'not' is an inverted cmp + +define <4 x i1> @andn_or_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @andn_or_cmp_3( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ule 
<4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[AND]] +; + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, ; thwart complexity-based ordering + %or = or <4 x i1> %x, %y + %and = and <4 x i1> %x_inv, %or + ret <4 x i1> %and +} + +; Commute the 'and': +; ((X | Y) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @andn_or_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x, %y + %and = and i1 %or, %x_inv + ret i1 %and +} diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll index 001ac58891e4..15772d158f62 100644 --- a/test/Transforms/InstCombine/and2.ll +++ b/test/Transforms/InstCombine/and2.ll @@ -98,8 +98,7 @@ define i64 @test9(i64 %x) { ; combine -x & 1 into x & 1 define <2 x i64> @test9vec(<2 x i64> %x) { ; CHECK-LABEL: @test9vec( -; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i64> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> %x, ; CHECK-NEXT: ret <2 x i64> [[AND]] ; %sub = sub nsw <2 x i64> , %x @@ -119,6 +118,88 @@ define i64 @test10(i64 %x) { ret i64 %add } +; (1 << x) & 1 --> zext(x == 0) + +define i8 @and1_shl1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. 
+ +define i8 @and1_shl1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = shl i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 << x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = shl <2 x i8> , %x + %and = and <2 x i8> %sh, + ret <2 x i8> %and +} + +; (1 >> x) & 1 --> zext(x == 0) + +define i8 @and1_lshr1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. + +define i8 @and1_lshr1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = lshr i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 >> x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = lshr <2 x i8> , %x + %and = and <2 x i8> %sh, + ret <2 x i8> %and +} + ; The add in this test is unnecessary because the LSBs of the LHS are 0 and the 'and' only consumes bits from those LSBs. 
It doesn't matter what happens to the upper bits. define i32 @test11(i32 %a, i32 %b) { ; CHECK-LABEL: @test11( diff --git a/test/Transforms/InstCombine/element-atomic-memintrins.ll b/test/Transforms/InstCombine/element-atomic-memintrins.ll new file mode 100644 index 000000000000..2e3bfd7b721d --- /dev/null +++ b/test/Transforms/InstCombine/element-atomic-memintrins.ll @@ -0,0 +1,98 @@ +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that inst combine handles these intrinsics properly once they have been +;; added to that class hierarchy. + +; RUN: opt -instcombine -S < %s | FileCheck %s + +;; ---- memset ----- + +; Ensure 0-length memset isn't removed +define void @test_memset_zero_length(i8* %dest) { + ; CHECK-LABEL: test_memset_zero_length + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ret void +} + +; Ensure that small-sized memsets don't convert to stores +define void @test_memset_to_store(i8* %dest) { + ; CHECK-LABEL: test_memset_to_store + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + call void 
@llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ret void +} + +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly + + +;; ========================================= +;; ----- memmove ------ + +; memmove from a global constant source does not become memcpy +@gconst = constant [8 x i8] c"0123456\00" +define void @test_memmove_to_memcpy(i8* %dest) { + ; CHECK-LABEL: test_memmove_to_memcpy + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ret void +} + +define void @test_memmove_zero_length(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_zero_length + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 
%src, i32 0, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ret void +} + +; memmove with src==dest is removed +define void @test_memmove_removed(i8* %srcdest, i32 %sz) { + ; CHECK-LABEL: test_memmove_removed + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ret void +} + +; memmove with a small constant length is converted to a load/store pair +define void 
@test_memmove_loadstore(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_loadstore + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ret void +} + +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll index faae2016e207..aa95cc5a1316 100644 --- a/test/Transforms/InstCombine/icmp-logical.ll +++ b/test/Transforms/InstCombine/icmp-logical.ll @@ -1,159 +1,138 @@ ; RUN: opt -instcombine -S -o - %s | FileCheck %s define i1 @masked_and_notallzeroes(i32 %A) { -; CHECK-LABEL: @masked_and_notallzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 
%mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 0 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notallones(i32 %A) { -; CHECK-LABEL: @masked_and_notallones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 39 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allones(i32 %A) { -; CHECK-LABEL: @masked_or_allones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 39 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notA(i32 %A) { -; CHECK-LABEL: @masked_and_notA -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp ne i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notA( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp ne i32 
[[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, %A - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_A(i32 %A) { -; CHECK-LABEL: @masked_or_A -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_A( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, %A - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes_notoptimised(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes_notoptimised -; CHECK: [[MASK:%.*]] = and i32 %A, 15 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes_notoptimised( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 15 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], 0 +; CHECK-NEXT: [[RES:%.*]] = or i1 [[TST1]], [[TST2]] +; CHECK-NEXT: ret i1 [[RES]] +; %mask1 = and i32 %A, 15 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @nomask_lhs(i32 %in) { -; CHECK-LABEL: @nomask_lhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_lhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST2]] +; %tst1 = icmp eq i32 %in, 0 - %masked = and i32 %in, 1 %tst2 = icmp eq i32 %masked, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } - define i1 @nomask_rhs(i32 %in) { -; CHECK-LABEL: @nomask_rhs -; 
CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_rhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %masked = and i32 %in, 1 %tst1 = icmp eq i32 %masked, 0 - %tst2 = icmp eq i32 %in, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_false(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_false -; CHECK: ret i1 false +; CHECK-LABEL: @fold_mask_cmps_to_false( +; CHECK-NEXT: ret i1 false +; %1 = and i32 %x, 2147483647 %2 = icmp eq i32 %1, 0 %3 = icmp eq i32 %x, 2147483647 @@ -161,12 +140,46 @@ define i1 @fold_mask_cmps_to_false(i32 %x) { ret i1 %4 } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_true(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_true -; CHECK: ret i1 true +; CHECK-LABEL: @fold_mask_cmps_to_true( +; CHECK-NEXT: ret i1 true +; %1 = and i32 %x, 2147483647 %2 = icmp ne i32 %1, 0 %3 = icmp ne i32 %x, 2147483647 %4 = or i1 %3, %2 ret i1 %4 } + +; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401 + +define i1 @cmpeq_bitwise(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: @cmpeq_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 %c, %d +; CHECK-NEXT: [[CMP:%.*]] = and i1 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %xor1 = xor i8 %a, %b + %xor2 = xor i8 %c, %d + %or = or i8 %xor1, %xor2 + %cmp = icmp eq i8 %or, 0 + ret i1 %cmp +} + +define <2 x i1> @cmpne_bitwise(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { +; CHECK-LABEL: @cmpne_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> %c, %d +; CHECK-NEXT: [[CMP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] +; 
CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %xor1 = xor <2 x i64> %a, %b + %xor2 = xor <2 x i64> %c, %d + %or = or <2 x i64> %xor1, %xor2 + %cmp = icmp ne <2 x i64> %or, zeroinitializer + ret <2 x i1> %cmp +} + diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index 947971c6c83b..be64f51b6c4c 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s -define i32 @test1(i32 %x, i32 %y) nounwind { +; X | ~(X | Y) --> X | ~Y + +define i32 @test1(i32 %x, i32 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -13,7 +15,10 @@ define i32 @test1(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test2(i32 %x, i32 %y) nounwind { +; Commute (rename) the inner 'or' operands: +; Y | ~(X | Y) --> ~X | Y + +define i32 @test2(i32 %x, i32 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -25,7 +30,9 @@ define i32 @test2(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test3(i32 %x, i32 %y) nounwind { +; X | ~(X ^ Y) --> X | ~Y + +define i32 @test3(i32 %x, i32 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -37,7 +44,10 @@ define i32 @test3(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test4(i32 %x, i32 %y) nounwind { +; Commute (rename) the 'xor' operands: +; Y | ~(X ^ Y) --> ~X | Y + +define i32 @test4(i32 %x, i32 %y) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -49,7 +59,7 @@ define i32 @test4(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test5(i32 %x, i32 %y) nounwind { +define i32 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: ret i32 -1 ; @@ 
-59,7 +69,7 @@ define i32 @test5(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test6(i32 %x, i32 %y) nounwind { +define i32 @test6(i32 %x, i32 %y) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: ret i32 -1 ; @@ -69,7 +79,7 @@ define i32 @test6(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test7(i32 %x, i32 %y) nounwind { +define i32 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[Z:%.*]] = or i32 %x, %y ; CHECK-NEXT: ret i32 [[Z]] @@ -79,7 +89,7 @@ define i32 @test7(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test8(i32 %x, i32 %y) nounwind { +define i32 @test8(i32 %x, i32 %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -91,7 +101,7 @@ define i32 @test8(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test9(i32 %x, i32 %y) nounwind { +define i32 @test9(i32 %x, i32 %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index 764fe4503b5e..fb56449ba4d4 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -397,14 +397,74 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) { ret <2 x i132> %or } -define i32 @test39(i32 %a, i32 %b) { -; CHECK-LABEL: @test39( -; CHECK-NEXT: [[OR:%.*]] = or i32 %b, %a +; (~A & B) | A --> A | B + +define i32 @test39a(i32 %a, float %b) { +; CHECK-LABEL: @test39a( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] ; CHECK-NEXT: ret i32 [[OR]] ; - %xor = xor i32 %a, -1 - %and = and i32 %xor, %b - %or = or i32 %and, %a + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %and, %a1 + ret i32 %or +} + +; 
Commute 'and' operands: +; (B & ~A) | A --> A | B + +define i32 @test39b(i32 %a, float %b) { +; CHECK-LABEL: @test39b( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %and, %a1 + ret i32 %or +} + +; Commute 'or' operands: +; A | (~A & B) --> A | B + +define i32 @test39c(i32 %a, float %b) { +; CHECK-LABEL: @test39c( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %a1, %and + ret i32 %or +} + +; Commute 'and' operands: +; A | (B & ~A) --> A | B + +define i32 @test39d(i32 %a, float %b) { +; CHECK-LABEL: @test39d( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %a1, %and ret i32 %or } @@ -456,60 +516,6 @@ define i32 @test40d(i32 %a, i32 %b) { ret i32 %or } -define i32 @test41(i32 %a, i32 %b) { -; CHECK-LABEL: @test41( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %and = and i32 %a, %b - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %or = or i32 %and, %xor - ret i32 %or -} - -; (~A ^ B) | (A & B) -> (~A ^ B) - -define i32 
@test42(i32 %a, i32 %b) { -; CHECK-LABEL: @test42( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_and(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_and( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %b, %a - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_xor(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %b, %nega - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - define i32 @test45(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, %z @@ -648,41 +654,146 @@ final: ret <2 x i32> %value } -define i8 @test51(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test51( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; In the next 4 tests, vary the types and predicates for extra coverage. 
+; (X | (Y & ~X)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @or_andn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x_inv + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & ~X) | X) -> (X | Y), where 'not' is an inverted cmp + +define <2 x i1> @or_andn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[OR]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, ; thwart complexity-based ordering + %and = and <2 x i1> %y, %x_inv + %or = or <2 x i1> %and, %x + ret <2 x i1> %or +} + +; Commute the 'and': +; (X | (~X & Y)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @or_andn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x_inv, %y + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((~X & Y) | X) -> (X | Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp 
ugt <3 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[OR]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, ; thwart complexity-based ordering + %and = and <3 x i1> %x_inv, %y + %or = or <3 x i1> %and, %x + ret <3 x i1> %or +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (~X | (Y & X)) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @orn_and_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %x_inv, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & X) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @orn_and_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %a, -1 - %y = and i8 %w, %z - %x = or i8 %y, %a - ret i8 %x + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %and, %x_inv + ret i1 %or } -define i8 @test52(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test52( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'and': +; (~X | (X & Y)) -> (~X | Y), where 'not' is an inverted cmp + +define <4 x i1> @orn_and_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @orn_and_cmp_3( +; 
CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR:%.*]] = or <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, ; thwart complexity-based ordering + %and = and <4 x i1> %x, %y + %or = or <4 x i1> %x_inv, %and + ret <4 x i1> %or } -define i8 @test53(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test53( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'or': +; ((X & Y) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @orn_and_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x, %y + %or = or i1 %and, %x_inv + ret i1 %or } diff --git a/test/Transforms/InstCombine/pr33765.ll b/test/Transforms/InstCombine/pr33765.ll new file mode 100644 index 000000000000..99ed0d13b5cf --- /dev/null +++ b/test/Transforms/InstCombine/pr33765.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S %s -instcombine | FileCheck %s + +@glob = external global i16 + +define void @patatino(i8 %beth) { +; CHECK-LABEL: @patatino( +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[BETH:%.*]] to i32 +; CHECK-NEXT: br i1 undef, label [[IF_THEN9:%.*]], label [[IF_THEN9]] +; CHECK: if.then9: +; CHECK-NEXT: [[MUL:%.*]] = mul 
nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[TINKY:%.*]] = load i16, i16* @glob, align 2 +; CHECK-NEXT: [[CONV131:%.*]] = zext i16 [[TINKY]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[MUL]], [[CONV131]] +; CHECK-NEXT: [[CONV14:%.*]] = trunc i32 [[AND]] to i16 +; CHECK-NEXT: store i16 [[CONV14]], i16* @glob, align 2 +; CHECK-NEXT: ret void +; + %conv = zext i8 %beth to i32 + %mul = mul nuw nsw i32 %conv, %conv + %conv3 = and i32 %mul, 255 + %tobool8 = icmp ne i32 %mul, %conv3 + br i1 %tobool8, label %if.then9, label %if.then9 + +if.then9: + %tinky = load i16, i16* @glob + %conv13 = sext i16 %tinky to i32 + %and = and i32 %mul, %conv13 + %conv14 = trunc i32 %and to i16 + store i16 %conv14, i16* @glob + ret void +} diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll index 6a3cf7edd7dc..5e84ec54971a 100644 --- a/test/Transforms/JumpThreading/select.ll +++ b/test/Transforms/JumpThreading/select.ll @@ -280,10 +280,85 @@ cond.false.15.i: ; preds = %cond.false.10.i ret i32 %j.add3 ; CHECK-LABEL: @unfold3 -; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i +; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i ; CHECK: br i1 %cmp4.i, label %.exit.thread, label %cond.false.6.i ; CHECK: br i1 %cmp8.i, label %.exit.thread2, label %cond.false.10.i ; CHECK: br i1 %cmp13.i, label %.exit.thread, label %.exit ; CHECK: br i1 %phitmp, label %.exit.thread, label %.exit.thread2 ; CHECK: br label %.exit.thread2 } + +define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + 
%cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 1, %entry ], [ 0, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 0, %cond.false.10.i ] + %lnot.i18 = icmp eq i32 %cond23.i, 1 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %add3 + ret i32 %j.add3 + +; CHECK-LABEL: @unfold4 +; CHECK: br i1 %cmp.i, label %.exit.thread, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit.thread3, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit.thread, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit.thread3, label %.exit +; CHECK: br i1 %lnot.i18, label %.exit.thread, label %.exit.thread3 +; CHECK: br label %.exit.thread3 +} + +define i32 @unfold5(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 2, %entry ], [ 3, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 7, %cond.false.10.i ] + %lnot.i18 = icmp sgt i32 %cond23.i, 5 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 
%cond23.i + ret i32 %j.add3 + +; CHECK-LABEL: @unfold5 +; CHECK: br i1 %cmp.i, label %.exit, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit, label %cond.false.15.i +; CHECK: br label %.exit +} diff --git a/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll new file mode 100644 index 000000000000..df6c6cfdbcb5 --- /dev/null +++ b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer + +;; FIXME: +;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. 
+;; for(gi=1;gi=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.body3.split1: ; preds = %for.cond1.preheader +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10 +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, 
label %for.body3, label %for.end11 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end11: +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange-simple-count-up.ll b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll new file mode 100644 index 000000000000..4febe0269810 --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; for(int i=0;i=0;j--) -;; A[j][i] = A[j][i]+k; - -define void @interchange_02(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - %cmp2 = icmp sgt i64 %indvars.iv, 0 - br i1 %cmp2, label %for.body3, label %for.inc10 - -for.inc10: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond = icmp eq i64 %indvars.iv.next20, 100 - br i1 %exitcond, label %for.end11, label %for.cond1.preheader - -for.end11: - ret void -} - -; CHECK-LABEL: @interchange_02 -; CHECK: entry: -; CHECK: br label %for.body3.preheader -; CHECK: 
for.cond1.preheader.preheader: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: -; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.split1 -; CHECK: for.body3.preheader: -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.body3.split1: ; preds = %for.cond1.preheader -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: br label %for.inc10 -; CHECK: for.body3.split: -; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 -; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 -; CHECK: for.inc10: -; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 -; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader -; CHECK: for.end11: -; CHECK: ret void - -;;--------------------------------------Test case 03------------------------------------- -;; Loops should not be interchanged in this case as it is not profitable. 
-;; for(int i=0;i<100;i++) -;; for(int j=0;j<100;j++) -;; A[i][j] = A[i][j]+k; - -define void @interchange_03(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 - br i1 %exitcond23, label %for.end12, label %for.cond1.preheader - -for.end12: - ret void -} - -; CHECK-LABEL: @interchange_03 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.cond1.preheader.preheader: ; preds = %entry -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 -; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.preheader -; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp eq 
i64 %indvars.iv.next, 100 -; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 -; CHECK: for.inc10: ; preds = %for.body3 -; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 -; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 -; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader -; CHECK: for.end12: ; preds = %for.inc10 -; CHECK: ret void - - -;;--------------------------------------Test case 04------------------------------------- -;; Loops should not be interchanged in this case as it is not legal due to dependency. -;; for(int j=0;j<99;j++) -;; for(int i=0;i<99;i++) -;; A[j][i+1] = A[j+1][i]+k; - -define void @interchange_04(i32 %k){ -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] - %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add6 = add nsw i32 %0, %k - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next - store i32 %add6, i32* %arrayidx11 - %exitcond = icmp eq i64 %indvars.iv.next, 99 - br i1 %exitcond, label %for.inc12, label %for.body3 - -for.inc12: - %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 - br i1 %exitcond25, label %for.end14, label %for.cond1.preheader - -for.end14: - ret void -} - -; CHECK-LABEL: @interchange_04 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry -; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] -; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 -; CHECK: br 
label %for.body3 -; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader -; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add6 = add nsw i32 %0, %k -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next -; CHECK: store i32 %add6, i32* %arrayidx11 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 -; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 -; CHECK: for.inc12: ; preds = %for.body3 -; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 -; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader -; CHECK: for.end14: ; preds = %for.inc12 -; CHECK: ret void - - - -;;--------------------------------------Test case 05------------------------------------- -;; Loops not tightly nested are not interchanged -;; for(int j=0;j [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -79,7 +79,7 @@ for.exit: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, 
%for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -144,7 +144,7 @@ scalar.body: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -288,7 +288,7 @@ for.cond.cleanup3: ; UNROLL-NO-IC-LABEL: @PR30183( ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 ; UNROLL-NO-IC-NEXT: br label %vector.body ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 8eec6e262c1a..a7cc4530ceb3 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -15,7 +15,7 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop1( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 
x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer @@ -37,7 +37,7 @@ ; VEC4_INTERL2-LABEL: @fp_iv_loop1( ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer @@ -63,7 +63,7 @@ ; VEC1_INTERL2-LABEL: @fp_iv_loop1( ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: br label %vector.body ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 @@ -115,7 +115,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1-LABEL: @fp_iv_loop2( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], ; VEC4_INTERL1-NEXT: br label %vector.body @@ -172,7 +172,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, 
float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer @@ -250,7 +250,7 @@ for.end: ; VEC4_INTERL1-LABEL: @fp_iv_loop4( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -289,7 +289,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( ; VEC2_INTERL1_PRED_STORE: vector.body: -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll index 7f381ae6ad7b..0d6e4b1e61b4 100644 --- a/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -13,24 +13,21 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]] -; CHECK: min.iters.checked: -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0 -; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -55,10 +52,10 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; 
CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index 33e8ed067160..b37537efcc51 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -15,7 +15,7 @@ ; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer @@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-LABEL: @induction_with_loop_inv( ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK: 
[[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 7e9e6b1cdc8e..d77806da59be 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -501,13 +501,13 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. ; CHECK-LABEL: max_i32_backedgetaken -; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 true, label %scalar.ph, label %vector.ph ; CHECK: middle.block: ; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 1e8b982363d8..89c0ac109167 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; ; CHECK-LABEL: @interleaved_with_cond_store_0( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select 
i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -58,7 +58,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_1( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -117,7 +117,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_2( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll index d84dc42bdf54..530c2f66552a 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -338,7 +338,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @even_load_dynamic_tc( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -579,7 +579,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @PR27626_0( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -627,7 +627,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_1( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -680,7 +680,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_2( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 
%[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -728,7 +728,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_3( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/iv_outside_user.ll b/test/Transforms/LoopVectorize/iv_outside_user.ll index 8a44af96e7f4..265188886996 100644 --- a/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -135,7 +135,7 @@ for.end: } ; CHECK-LABEL: @PR30742 -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll index 1cb67f9150ac..f5f4eb5eaa01 100644 --- a/test/Transforms/LoopVectorize/miniters.ll +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -10,10 +10,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. 
; CHECK-LABEL: foo( ; CHECK: %min.iters.check = icmp ult i64 %N, 4 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph ; UNROLL-LABEL: foo( ; UNROLL: %min.iters.check = icmp ult i64 %N, 8 -; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph define void @foo(i64 %N) { entry: diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll new file mode 100644 index 000000000000..40af8f3adf02 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -0,0 +1,240 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the vectorizer identifies the %p.09 phi, +; as an induction variable, despite the potential overflow +; due to the truncation from 32bit to 8bit. +; SCEV will detect the pattern "sext(trunc(%p.09)) + %step" +; and generate the required runtime checks under which +; we can assume no overflow. We check here that we generate +; exactly two runtime checks: +; 1) an overflow check: +; {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) an equality check verifying that the step of the induction +; is equal to sext(trunc(step)): +; Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32) +; +; See also pr30654. 
+; +; int a[N]; +; void doit1(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit1 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @doit1(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.09, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Same as above, but for checking the SCEV "zext(trunc(%p.09)) + %step". 
+; Here we expect the following two predicates to be added for runtime checking: +; 1) {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) Equal predicate: %step == (zext i8 (trunc i32 %step to i8) to i32) +; +; int a[N]; +; void doit2(int n, int step) { +; int i; +; unsigned char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit2 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit2(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %conv = and i32 %p.09, 255 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Here we check that the same phi scev analysis would fail +; to create the runtime checks because the step is not invariant. +; As a result vectorization will fail. 
+; +; int a[N]; +; void doit3(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; step += 2; +; } +; } +; + +; CHECK-LABEL: @doit3 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: + +; Function Attrs: norecurse nounwind uwtable +define void @doit3(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.012 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ] + %sext = shl i32 %p.012, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step.addr.010 + %add3 = add nsw i32 %step.addr.010, 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Lastly, we also check the case where we can tell at compile time that +; the step of the induction is equal to sext(trunc(step)), in which case +; we don't have to check this equality at runtime (we only need the +; runtime overflow check). 
Therefore only the following overflow predicate +; will be added for runtime checking: +; {0,+,%cstep}<%for.body> Added Flags: +; +; a[N]; +; void doit4(int n, char cstep) { +; int i; +; char p = 0; +; int istep = cstep; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + istep; +; } +; } + +; CHECK-LABEL: @doit4 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %{{.*}} = or i1 {{.*}}, %mul.overflow +; CHECK-NOT: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK-NOT: %{{.*}} = or i1 %{{.*}}, %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr { +entry: + %conv = sext i8 %cstep to i32 + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.011, 24 + %conv2 = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv2, i32* %arrayidx, align 4 + %add = add nsw i32 %conv2, %conv + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll index ac1145aab67b..b37d94c0c328 100644 --- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ 
b/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -4,7 +4,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @add_ints( ;CHECK: br -;CHECK: br ;CHECK: getelementptr ;CHECK-DAG: getelementptr ;CHECK-DAG: icmp ugt diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 958b3c135c97..fb0548612715 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: define i32 @foo ;CHECK: for.body.preheader: -;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] +;CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] ;CHECK: vector.memcheck: ;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]] ;CHECK: load <4 x float> diff --git a/test/tools/llvm-cov/showTabsHTML.cpp b/test/tools/llvm-cov/showTabsHTML.cpp index 953c06a3c60d..c092841aeb22 100644 --- a/test/tools/llvm-cov/showTabsHTML.cpp +++ b/test/tools/llvm-cov/showTabsHTML.cpp @@ -1,5 +1,5 @@ // RUN: llvm-profdata merge -o %t.profdata %S/Inputs/showTabsHTML.proftext -// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck -check-prefix=CHECK %s +// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck %s int main(int argc, char ** argv) { (void) "This tab starts at column 0"; // CHECK:   (void) "This tab starts at column 0"; @@ -13,4 +13,4 @@ int main(int argc, char ** argv) { // CHECK-TABSIZE:    (void) "This tab starts at column 0"; // CHECK-TABSIZE: (void) "  This tab starts at column 10"; -// CHECK-TABSIZE: (void) "This     tab starts at column 15"; \ No newline at end of 
file +// CHECK-TABSIZE: (void) "This     tab starts at column 15"; diff --git a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s new file mode 100644 index 000000000000..90733eda278f --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s @@ -0,0 +1,193 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: error: DIE has invalid DW_AT_stmt_list encoding:{{[[:space:]]}} +# CHECK-NEXT: 0x0000000c: DW_TAG_compile_unit [1] * +# CHECK-NEXT: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000000] = "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)") +# CHECK-NEXT: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +# CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "basic.c") +# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_strx4] ( indexed (00000000) string = ) +# CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000003f] = "/Users/sgravani/Development/tests") +# CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +# CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4] (0x00000016){{[[:space:]]}} +# CHECK-NEXT: Units[2] - start offset: 0x00000068 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. +# CHECK-NEXT: Error: The unit type encoding is not valid. 
+ + + .section __TEXT,__text,regular,pure_instructions + .globl _main ## -- Begin function main + .p2align 4, 0x90 +_main: ## @main +Lfunc_begin0: + .file 1 "basic.c" + .loc 1 1 0 ## basic.c:1:0 + .cfi_startproc +## BB#0: ## %entry + pushq %rbp +Lcfi0: + .cfi_def_cfa_offset 16 +Lcfi1: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +Lcfi2: + .cfi_def_cfa_register %rbp + xorl %eax, %eax + movl $0, -4(%rbp) +Ltmp0: + .loc 1 2 7 prologue_end ## basic.c:2:7 + movl $1, -8(%rbp) + .loc 1 3 3 ## basic.c:3:3 + popq %rbp + retq +Ltmp1: +Lfunc_end0: + .cfi_endproc + ## -- End function + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "main" ## string offset=97 + .asciz "int" ## string offset=102 + .asciz "a" ## string offset=106 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 14 ## DW_FORM_strp + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 16 ## DW_AT_stmt_list + .byte 40 ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding: + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## 
DW_FORM_data1 + .byte 39 ## DW_AT_prototyped + .byte 25 ## DW_FORM_flag_present + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 87 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 8 ## Address Size (in bytes) +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset0 + .byte 1 ## Abbrev [1] 0xc:0x4f DW_TAG_compile_unit + .long 0 ## DW_AT_producer + .short 12 ## DW_AT_language + .long 55 ## DW_AT_name +Lset1 = Lline_table_start0-Lsection_line ## DW_AT_stmt_list + .long Lset1 + .long 63 ## DW_AT_comp_dir + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset2 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset2 + .byte 2 ## Abbrev [2] 0x2b:0x28 DW_TAG_subprogram + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset3 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset3 + .byte 1 ## DW_AT_frame_base + .byte 86 + .long 97 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + ## DW_AT_prototyped + .long 83 ## DW_AT_type + ## DW_AT_external + .byte 3 ## Abbrev [3] 0x44:0xe DW_TAG_variable + .byte 2 ## DW_AT_location + .byte 145 + .byte 120 + .long 106 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 2 ## DW_AT_decl_line + .long 83 ## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0x53:0x7 DW_TAG_base_type + .long 102 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 0 ## End Of Children Mark +Lcu_begin1: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin0: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. 
+ .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s new file mode 100644 index 000000000000..a3a54077bbf9 --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s @@ -0,0 +1,81 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: Units[1] - start offset: 0x0000000d +# CHECK-NEXT: Error: The unit type encoding is not valid. +# CHECK-NEXT: Error: The address size is unsupported. +# CHECK-NEXT: Units[2] - start offset: 0x00000026 +# CHECK-NEXT: Error: The 16 bit unit header version is not valid. +# CHECK-NEXT: Error: The offset into the .debug_abbrev section is not valid. +# CHECK-NEXT: Units[4] - start offset: 0x00000041 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. 
+ + .section __TEXT,__text,regular,pure_instructions + .file 1 "basic.c" + .comm _i,4,2 ## @i + .comm _j,4,2 ## @j + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 307232) (llvm/trunk 307042)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "i" ## string offset=97 + .asciz "int" ## string offset=99 + .asciz "j" ## string offset=103 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 9 ## Length of Unit + .short 4 ## DWARF version number +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset0 + .byte 4 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Ltu_begin0: + .long 21 ## Length of Unit + .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type -- Error: The unit type encoding is not valid. + .byte 3 ## Address Size (in bytes) -- Error: The address size is unsupported. + .long 0 + .quad 0 + .long 0 + .byte 0 +Lcu_begin1: + .long 10 ## Length of Unit + .short 6 ## DWARF version number -- Error: The 16 bit unit header version is not valid. + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) -- The offset into the .debug_abbrev section is not valid. + .long Lline_table_start0 + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Lcu_begin2: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin1: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. 
+ .short 5 ## DWARF version number + .byte 2 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-mt/help.test b/test/tools/llvm-mt/help.test new file mode 100644 index 000000000000..29e3667ec2ca --- /dev/null +++ b/test/tools/llvm-mt/help.test @@ -0,0 +1,7 @@ +RUN: llvm-mt /h | FileCheck %s -check-prefix=HELP + +RUN: llvm-mt /inputresource:foo.res /manifest foo.manifest | FileCheck %s -check-prefix=NOT_SUPPORTED + +HELP: OVERVIEW: Manifest Tool + +NOT_SUPPORTED: llvm-mt: ignoring unsupported 'inputresource:' option diff --git a/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 new file mode 100644 index 000000000000..58ed3c7a48fc Binary files /dev/null and b/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 differ diff --git a/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test b/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test new file mode 100644 index 000000000000..4cc4a6eca2a1 --- /dev/null +++ b/test/tools/llvm-objdump/AArch64/macho-reloc-addend.test @@ -0,0 +1,6 @@ +RUN: llvm-objdump -r %p/Inputs/reloc-addend.obj.macho-aarch64 | FileCheck %s + +CHECK-DAG: 0000000000000004 ARM64_RELOC_ADDEND 0x999 +CHECK-DAG: 0000000000000004 ARM64_RELOC_PAGEOFF12 _stringbuf +CHECK-DAG: 0000000000000000 ARM64_RELOC_ADDEND 0x999 +CHECK-DAG: 0000000000000000 ARM64_RELOC_PAGE21 _stringbuf diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 b/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 index fb9d37881c98..01bd1c2fc1ed 100644 Binary files a/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 and b/test/tools/llvm-readobj/Inputs/dynamic-table-so.x86 differ diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table.c 
b/test/tools/llvm-readobj/Inputs/dynamic-table.c index b5251f87ee53..0af66ca0c53f 100644 --- a/test/tools/llvm-readobj/Inputs/dynamic-table.c +++ b/test/tools/llvm-readobj/Inputs/dynamic-table.c @@ -1,10 +1,10 @@ // clang -target x86_64-linux-gnu -shared -fPIC -lc dynamic-table.c \ -// -o dynamic-table-so.x86 -Wl,-f,aux_val +// -o dynamic-table-so.x86 -Wl,-f,aux.so -Wl,-F,filter.so // clang -target mipsel-linux-gnu -shared -fPIC -lc dynamic-table.c \ // -o dynamic-table-so.mips // clang -target mipsel-linux-gnu -lc dynamic-table.c \ // -o dynamic-table-exe.mips -// clang -target aarch64-linux-gnu -fPIC -shared dynamic-table.c \ +// clang -target aarch64-linux-gnu -fPIC -shared dynamic-table.c\ // -o dynamic-table-so.aarch64 int puts(const char *); diff --git a/test/tools/llvm-readobj/dynamic.test b/test/tools/llvm-readobj/dynamic.test index 5079d3556957..71b6b06cbc08 100644 --- a/test/tools/llvm-readobj/dynamic.test +++ b/test/tools/llvm-readobj/dynamic.test @@ -8,7 +8,7 @@ ELF-MIPS: AddressSize: 32bit ELF-MIPS: LoadName: ELF-MIPS: DynamicSection [ (23 entries) ELF-MIPS: Tag Type Name/Value -ELF-MIPS: 0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-MIPS: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-MIPS: 0x0000000C INIT 0x528 ELF-MIPS: 0x0000000D FINI 0x860 ELF-MIPS: 0x00000004 HASH 0x210 @@ -43,7 +43,7 @@ ELF-MIPS-EXE: AddressSize: 32bit ELF-MIPS-EXE: LoadName: ELF-MIPS-EXE: DynamicSection [ (26 entries) ELF-MIPS-EXE: Tag Type Name/Value -ELF-MIPS-EXE: 0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-MIPS-EXE: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-MIPS-EXE: 0x0000000C INIT 0x400418 ELF-MIPS-EXE: 0x0000000D FINI 0x4007B0 ELF-MIPS-EXE: 0x00000004 HASH 0x4002B8 @@ -80,9 +80,9 @@ ELF-X86-EXE: AddressSize: 32bit ELF-X86-EXE: LoadName: ELF-X86-EXE: DynamicSection [ (30 entries) ELF-X86-EXE: Tag Type Name/Value -ELF-X86-EXE: 0x00000001 NEEDED SharedLibrary (libstdc++.so.6) -ELF-X86-EXE: 0x00000001 NEEDED SharedLibrary (libgcc_s.so.1) -ELF-X86-EXE: 
0x00000001 NEEDED SharedLibrary (libc.so.6) +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libstdc++.so.6] +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libgcc_s.so.1] +ELF-X86-EXE: 0x00000001 NEEDED Shared library: [libc.so.6] ELF-X86-EXE: 0x0000000C INIT 0x62C ELF-X86-EXE: 0x0000000D FINI 0x920 ELF-X86-EXE: 0x00000019 INIT_ARRAY 0x19FC @@ -119,32 +119,33 @@ ELF-X86-SO: Format: ELF64-x86-64 ELF-X86-SO: Arch: x86_64 ELF-X86-SO: AddressSize: 64bit ELF-X86-SO: LoadName: -ELF-X86-SO: DynamicSection [ (26 entries) +ELF-X86-SO: DynamicSection [ ({{[0-9]+}} entries) ELF-X86-SO: Tag Type Name/Value -ELF-X86-SO: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) -ELF-X86-SO: 0x0000000000000001 NEEDED SharedLibrary (ld-linux-x86-64.so.2) -ELF-X86-SO: 0x000000007FFFFFFD AUXILIARY Auxiliary library: [aux_val] -ELF-X86-SO: 0x000000000000000C INIT 0x610 -ELF-X86-SO: 0x000000000000000D FINI 0x7AC -ELF-X86-SO: 0x0000000000000019 INIT_ARRAY 0x200DD0 +ELF-X86-SO: 0x0000000000000001 NEEDED Shared library: [libc.so.6] +ELF-X86-SO: 0x0000000000000001 NEEDED Shared library: [ld-linux-x86-64.so.2] +ELF-X86-SO: 0x000000007FFFFFFF FILTER Filter library: [filter.so] +ELF-X86-SO: 0x000000007FFFFFFD AUXILIARY Auxiliary library: [aux.so] +ELF-X86-SO: 0x000000000000000C INIT 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x000000000000000D FINI 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x0000000000000019 INIT_ARRAY 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000000000001B INIT_ARRAYSZ 8 (bytes) -ELF-X86-SO: 0x000000000000001A FINI_ARRAY 0x200DD8 +ELF-X86-SO: 0x000000000000001A FINI_ARRAY 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000000000001C FINI_ARRAYSZ 8 (bytes) ELF-X86-SO: 0x000000006FFFFEF5 GNU_HASH 0x1C8 ELF-X86-SO: 0x0000000000000005 STRTAB 0x3A0 ELF-X86-SO: 0x0000000000000006 SYMTAB 0x208 -ELF-X86-SO: 0x000000000000000A STRSZ 231 (bytes) +ELF-X86-SO: 0x000000000000000A STRSZ {{[0-9]+}} (bytes) ELF-X86-SO: 0x000000000000000B SYMENT 24 (bytes) ELF-X86-SO: 0x0000000000000003 PLTGOT 0x201000 ELF-X86-SO: 0x0000000000000002 PLTRELSZ 48 
(bytes) ELF-X86-SO: 0x0000000000000014 PLTREL RELA -ELF-X86-SO: 0x0000000000000017 JMPREL 0x5E0 -ELF-X86-SO: 0x0000000000000007 RELA 0x4F0 +ELF-X86-SO: 0x0000000000000017 JMPREL 0x{{[0-9A-F]+}} +ELF-X86-SO: 0x0000000000000007 RELA 0x{{[0-9A-F]+}} ELF-X86-SO: 0x0000000000000008 RELASZ 240 (bytes) ELF-X86-SO: 0x0000000000000009 RELAENT 24 (bytes) -ELF-X86-SO: 0x000000006FFFFFFE VERNEED 0x4B0 +ELF-X86-SO: 0x000000006FFFFFFE VERNEED 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000006FFFFFFF VERNEEDNUM 2 -ELF-X86-SO: 0x000000006FFFFFF0 VERSYM 0x488 +ELF-X86-SO: 0x000000006FFFFFF0 VERSYM 0x{{[0-9A-F]+}} ELF-X86-SO: 0x000000006FFFFFF9 RELACOUNT 3 ELF-X86-SO: 0x0000000000000000 NULL 0x0 @@ -157,7 +158,7 @@ ELF-AARCH64-SO: AddressSize: 64bit ELF-AARCH64-SO: LoadName: ELF-AARCH64-SO: DynamicSection [ (26 entries) ELF-AARCH64-SO: Tag Type Name/Value -ELF-AARCH64-SO: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6) +ELF-AARCH64-SO: 0x0000000000000001 NEEDED Shared library: [libc.so.6] ELF-AARCH64-SO: 0x000000000000000C INIT 0x660 ELF-AARCH64-SO: 0x000000000000000D FINI 0x83C ELF-AARCH64-SO: 0x0000000000000019 INIT_ARRAY 0x10DB8 diff --git a/test/tools/llvm-readobj/gnu-sections.test b/test/tools/llvm-readobj/gnu-sections.test index fb90ce44d10f..f5b504fa66fa 100644 --- a/test/tools/llvm-readobj/gnu-sections.test +++ b/test/tools/llvm-readobj/gnu-sections.test @@ -1,6 +1,14 @@ RUN: llvm-readobj -s %p/Inputs/relocs.obj.elf-i386 --elf-output-style=GNU \ RUN: | FileCheck %s -check-prefix ELF32 -RUN: llvm-readobj -s %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: llvm-readobj -S %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readobj --wide --sections \ +RUN: %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readobj -W --sections \ +RUN: %p/Inputs/relocs.obj.elf-x86_64 --elf-output-style=GNU \ +RUN: | FileCheck %s -check-prefix ELF64 +RUN: llvm-readelf -W -S 
%p/Inputs/relocs.obj.elf-x86_64 \ RUN: | FileCheck %s -check-prefix ELF64 ELF32: Section Headers: diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt index 3bb0c8f7b7c5..731bcbd8ac9d 100644 --- a/tools/llvm-ar/CMakeLists.txt +++ b/tools/llvm-ar/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} Core + DlltoolDriver LibDriver Object Support @@ -15,3 +16,4 @@ add_llvm_tool(llvm-ar add_llvm_tool_symlink(llvm-ranlib llvm-ar) add_llvm_tool_symlink(llvm-lib llvm-ar) +add_llvm_tool_symlink(llvm-dlltool llvm-ar) diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp index 500507fd4966..af4d3efa52f7 100644 --- a/tools/llvm-ar/llvm-ar.cpp +++ b/tools/llvm-ar/llvm-ar.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" #include "llvm/ToolDrivers/llvm-lib/LibDriver.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" @@ -863,6 +864,9 @@ int main(int argc, char **argv) { llvm::InitializeAllAsmParsers(); StringRef Stem = sys::path::stem(ToolName); + if (Stem.find("dlltool") != StringRef::npos) + return dlltoolDriverMain(makeArrayRef(argv, argc)); + if (Stem.find("ranlib") == StringRef::npos && Stem.find("lib") != StringRef::npos) return libDriverMain(makeArrayRef(argv, argc)); @@ -878,5 +882,5 @@ int main(int argc, char **argv) { return ranlib_main(); if (Stem.find("ar") != StringRef::npos) return ar_main(); - fail("Not ranlib, ar or lib!"); + fail("Not ranlib, ar, lib or dlltool!"); } diff --git a/tools/llvm-mt/CMakeLists.txt b/tools/llvm-mt/CMakeLists.txt new file mode 100644 index 000000000000..d97cc4fe446f --- /dev/null +++ b/tools/llvm-mt/CMakeLists.txt @@ -0,0 +1,13 @@ +set(LLVM_LINK_COMPONENTS + Option + Support + ) + +set(LLVM_TARGET_DEFINITIONS Opts.td) + +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(MtTableGen) + +add_llvm_tool(llvm-mt + 
llvm-mt.cpp + ) diff --git a/tools/llvm-mt/LLVMBuild.txt b/tools/llvm-mt/LLVMBuild.txt new file mode 100644 index 000000000000..894d3527924b --- /dev/null +++ b/tools/llvm-mt/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/llvm-mt/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-mt +parent = Tools +required_libraries = Option Support diff --git a/tools/llvm-mt/Opts.td b/tools/llvm-mt/Opts.td new file mode 100644 index 000000000000..6dc3eea524e6 --- /dev/null +++ b/tools/llvm-mt/Opts.td @@ -0,0 +1,29 @@ +include "llvm/Option/OptParser.td" + +def unsupported : OptionGroup<"unsupported">; +def manifest : Separate<["/", "-"], "manifest">, HelpText<"Used to specify each manifest that need to be processed">, MetaVarName<"manifest">; +def identity : Joined<["/", "-"], "identity:">, HelpText<"Not supported">, MetaVarName<"identity">, Group; +def rgs : Joined<["/", "-"], "rgs:">, HelpText<"Not supported">, MetaVarName<"script">, Group; +def tlb : Joined<["/", "-"], "tlb:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def dll : Joined<["/", "-"], "dll:">, HelpText<"Not supported">, MetaVarName<"dll">, Group; +def replacements : Joined<["/", "-"], "replacements:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def managed_assembly_name : Joined<["/", "-"], "managedassemblyname:">, HelpText<"Not supported">, MetaVarName<"assembly">, Group; +def no_dependency : Flag<["/", "-"], "nodependency">, 
HelpText<"Not supported">, Group; +def category : Flag<["/", "-"], "category">, HelpText<"Not supported">, Group; +def no_logo : Flag<["/", "-"], "nologo">, HelpText<"No effect as this tool never writes copyright data. Included for parity">; +def out : Joined<["/", "-"], "out:">, HelpText<"Name of the output manifest. If this is skipped and only one manifest is being operated upon by the tool, that manifest is modified in place">, MetaVarName<"manifest">; +def input_resource : Joined<["/", "-"], "inputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def output_resource : Joined<["/", "-"], "outputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def output_resource_flag : Flag<["/", "-"], "outputresource">, Alias, HelpText<"Not supported">, Group; +def update_resource : Joined<["/", "-"], "updateresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def hash_update : Joined<["/", "-"], "hashupdate:">, HelpText<"Not supported">, MetaVarName<"file">, Group; +def hash_update_flag : Flag<["/", "-"], "hashupdate">, Alias, HelpText<"Not supported">, Group; +def validate_manifest : Flag<["/", "-"], "validate_manifest">, HelpText<"Not supported">, Group; +def validate_file_hashes : Joined<["/", "-"], "validate_file_hashes:">, HelpText<"Not supported">, MetaVarName<"">, Group; +def canonicalize : Flag<["/", "-"], "canonicalize:">, HelpText<"Not supported">, Group; +def check_for_duplicates : Flag<["/", "-"], "check_for_duplicates:">, HelpText<"Not supported">, Group; +def make_cdfs : Flag<["/", "-"], "makecdfs:">, HelpText<"Not supported">, Group; +def verbose : Flag<["/", "-"], "verbose">, HelpText<"Not supported">, Group; +def help : Flag<["/", "-"], "?">; +def help_long : Flag<["/", "-"], "help">, Alias; +def h : Flag<["/", "-"], "h">, Alias; diff --git a/tools/llvm-mt/llvm-mt.cpp b/tools/llvm-mt/llvm-mt.cpp new file mode 100644 index 000000000000..05c9238c7c64 --- /dev/null +++ b/tools/llvm-mt/llvm-mt.cpp @@ -0,0 
+1,117 @@ +//===- llvm-mt.cpp - Merge .manifest files ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// Merge .manifest files. This is intended to be a platform-independent port +// of Microsoft's mt.exe. +// +//===---------------------------------------------------------------------===// + +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/raw_ostream.h" + +#include + +using namespace llvm; + +namespace { + +enum ID { + OPT_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Opts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Opts.inc" +#undef PREFIX + +static const opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ +{ \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Opts.inc" +#undef OPTION +}; + +class CvtResOptTable : public opt::OptTable { +public: + CvtResOptTable() : OptTable(InfoTable, true) {} +}; + +static ExitOnError ExitOnErr; +} // namespace + +LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) { + errs() << "llvm-mt error: " << Msg << "\n"; + exit(1); +} + +int main(int argc, const char **argv) { + sys::PrintStackTraceOnErrorSignal(argv[0]); + PrettyStackTraceProgram X(argc, argv); + + ExitOnErr.setBanner("llvm-mt: "); + + SmallVector argv_buf; + SpecificBumpPtrAllocator ArgAllocator; + ExitOnErr(errorCodeToError(sys::Process::GetArgumentVector( + argv_buf, makeArrayRef(argv, argc), ArgAllocator))); + + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
+ + CvtResOptTable T; + unsigned MAI, MAC; + ArrayRef ArgsArr = makeArrayRef(argv + 1, argc); + opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC); + + for (auto &Arg : InputArgs) { + if (Arg->getOption().matches(OPT_unsupported)) { + outs() << "llvm-mt: ignoring unsupported '" << Arg->getOption().getName() + << "' option\n"; + } + } + + if (InputArgs.hasArg(OPT_help)) { + T.PrintHelp(outs(), "mt", "Manifest Tool", false); + return 0; + } + + std::vector InputFiles = InputArgs.getAllArgValues(OPT_manifest); + + if (InputFiles.size() == 0) { + reportError("no input file specified"); + } + + StringRef OutputFile; + + if (InputArgs.hasArg(OPT_out)) { + OutputFile = InputArgs.getLastArgValue(OPT_out); + } else if (InputFiles.size() == 1) { + OutputFile = InputFiles[0]; + } else { + reportError("no output file specified"); + } + + return 0; +} diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 812f1af3ac68..d54b45515f05 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -870,7 +870,10 @@ static void printRelocationTargetName(const MachOObjectFile *O, bool isExtern = O->getPlainRelocationExternal(RE); uint64_t Val = O->getPlainRelocationSymbolNum(RE); - if (isExtern) { + if (O->getAnyRelocationType(RE) == MachO::ARM64_RELOC_ADDEND) { + fmt << format("0x%x", Val); + return; + } else if (isExtern) { symbol_iterator SI = O->symbol_begin(); advance(SI, Val); Expected SOrErr = SI->getName(); diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp index 0642d841fd9f..01c7481c3086 100644 --- a/tools/llvm-pdbutil/DumpOutputStyle.cpp +++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp @@ -654,7 +654,7 @@ static void dumpFullTypeStream(LinePrinter &Printer, NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords()); MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types, - Stream.getHashValues()); + Stream.getNumHashBuckets(), 
Stream.getHashValues()); if (auto EC = codeview::visitTypeStream(Types, V)) { Printer.formatLine("An error occurred dumping type records: {0}", @@ -670,7 +670,7 @@ static void dumpPartialTypeStream(LinePrinter &Printer, NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords()); MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types, - Stream.getHashValues()); + Stream.getNumHashBuckets(), Stream.getHashValues()); if (opts::dump::DumpTypeDependents) { // If we need to dump all dependents, then iterate each index and find diff --git a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp index ab7045ca4492..d93843649db0 100644 --- a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp +++ b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp @@ -725,8 +725,9 @@ Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) { Proc.Parent, Proc.End, formatSegmentOffset(Proc.Segment, Proc.CodeOffset), Proc.CodeSize); - P.formatLine("debug start = {0}, debug end = {1}, flags = {2}", Proc.DbgStart, - Proc.DbgEnd, + // FIXME: It seems FunctionType is sometimes an id and sometimes a type. 
+ P.formatLine("type = `{0}`, debug start = {1}, debug end = {2}, flags = {3}", + typeIndex(Proc.FunctionType), Proc.DbgStart, Proc.DbgEnd, formatProcSymFlags(P.getIndentLevel() + 9, Proc.Flags)); return Error::success(); } diff --git a/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/tools/llvm-pdbutil/MinimalTypeDumper.cpp index 9621320ea99a..0079b9e7eaa4 100644 --- a/tools/llvm-pdbutil/MinimalTypeDumper.cpp +++ b/tools/llvm-pdbutil/MinimalTypeDumper.cpp @@ -18,6 +18,7 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/PDB/Native/TpiHashing.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" @@ -214,10 +215,20 @@ Error MinimalTypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { getLeafTypeName(Record.Type), Record.length()); } else { std::string H; - if (Index.toArrayIndex() >= HashValues.size()) + if (Index.toArrayIndex() >= HashValues.size()) { H = "(not present)"; - else - H = utostr(HashValues[Index.toArrayIndex()]); + } else { + uint32_t Hash = HashValues[Index.toArrayIndex()]; + Expected MaybeHash = hashTypeRecord(Record); + if (!MaybeHash) + return MaybeHash.takeError(); + uint32_t OurHash = *MaybeHash; + OurHash %= NumHashBuckets; + if (Hash == OurHash) + H = "0x" + utohexstr(Hash); + else + H = "0x" + utohexstr(Hash) + ", our hash = 0x" + utohexstr(OurHash); + } P.formatLine("{0} | {1} [size = {2}, hash = {3}]", fmt_align(Index, AlignStyle::Right, Width), getLeafTypeName(Record.Type), Record.length(), H); @@ -395,8 +406,7 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR, Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { - P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age, - fmt_guid(TS.Guid)); + P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age, TS.Guid); return Error::success(); } diff --git 
a/tools/llvm-pdbutil/MinimalTypeDumper.h b/tools/llvm-pdbutil/MinimalTypeDumper.h index 42882b4b4060..4227688f0f71 100644 --- a/tools/llvm-pdbutil/MinimalTypeDumper.h +++ b/tools/llvm-pdbutil/MinimalTypeDumper.h @@ -25,9 +25,10 @@ class MinimalTypeDumpVisitor : public codeview::TypeVisitorCallbacks { public: MinimalTypeDumpVisitor(LinePrinter &P, uint32_t Width, bool RecordBytes, bool Hashes, codeview::LazyRandomTypeCollection &Types, + uint32_t NumHashBuckets, FixedStreamArray HashValues) : P(P), Width(Width), RecordBytes(RecordBytes), Hashes(Hashes), - Types(Types), HashValues(HashValues) {} + Types(Types), NumHashBuckets(NumHashBuckets), HashValues(HashValues) {} Error visitTypeBegin(codeview::CVType &Record, codeview::TypeIndex Index) override; @@ -53,6 +54,7 @@ private: bool RecordBytes = false; bool Hashes = false; codeview::LazyRandomTypeCollection &Types; + uint32_t NumHashBuckets; FixedStreamArray HashValues; }; } // namespace pdb diff --git a/tools/llvm-pdbutil/PdbYaml.cpp b/tools/llvm-pdbutil/PdbYaml.cpp index 315ae2e6711f..9c3beb566d2c 100644 --- a/tools/llvm-pdbutil/PdbYaml.cpp +++ b/tools/llvm-pdbutil/PdbYaml.cpp @@ -38,41 +38,6 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::pdb::PdbRaw_FeatureSig) namespace llvm { namespace yaml { -template <> struct ScalarTraits { - static void output(const llvm::pdb::PDB_UniqueId &S, void *, - llvm::raw_ostream &OS) { - OS << S; - } - - static StringRef input(StringRef Scalar, void *Ctx, - llvm::pdb::PDB_UniqueId &S) { - if (Scalar.size() != 38) - return "GUID strings are 38 characters long"; - if (Scalar[0] != '{' || Scalar[37] != '}') - return "GUID is not enclosed in {}"; - if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' || - Scalar[24] != '-') - return "GUID sections are not properly delineated with dashes"; - - uint8_t *OutBuffer = S.Guid; - for (auto Iter = Scalar.begin(); Iter != Scalar.end();) { - if (*Iter == '-' || *Iter == '{' || *Iter == '}') { - ++Iter; - continue; - } - uint8_t Value = 
(llvm::hexDigitValue(*Iter) << 4); - ++Iter; - Value |= llvm::hexDigitValue(*Iter); - ++Iter; - *OutBuffer++ = Value; - } - - return ""; - } - - static bool mustQuote(StringRef Scalar) { return needsQuotes(Scalar); } -}; - template <> struct ScalarEnumerationTraits { static void enumeration(IO &io, llvm::pdb::PDB_Machine &Value) { io.enumCase(Value, "Invalid", PDB_Machine::Invalid); diff --git a/tools/llvm-pdbutil/PdbYaml.h b/tools/llvm-pdbutil/PdbYaml.h index 62ed608916fc..91e054490a5f 100644 --- a/tools/llvm-pdbutil/PdbYaml.h +++ b/tools/llvm-pdbutil/PdbYaml.h @@ -57,7 +57,7 @@ struct PdbInfoStream { PdbRaw_ImplVer Version = PdbImplVC70; uint32_t Signature = 0; uint32_t Age = 1; - PDB_UniqueId Guid; + codeview::GUID Guid; std::vector Features; std::vector NamedStreams; }; diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp index 6aa08ff3cd87..f2bd194622ed 100644 --- a/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -956,8 +956,8 @@ static void mergePdbs() { SmallVector IdMap; if (File.hasPDBTpiStream()) { auto &Tpi = ExitOnErr(File.getPDBTpiStream()); - ExitOnErr(codeview::mergeTypeRecords(MergedTpi, TypeMap, nullptr, - Tpi.typeArray())); + ExitOnErr( + codeview::mergeTypeRecords(MergedTpi, TypeMap, Tpi.typeArray())); } if (File.hasPDBIpiStream()) { auto &Ipi = ExitOnErr(File.getPDBIpiStream()); diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt index bde486a5f0db..f5b1a5b256ad 100644 --- a/tools/llvm-readobj/CMakeLists.txt +++ b/tools/llvm-readobj/CMakeLists.txt @@ -20,3 +20,5 @@ add_llvm_tool(llvm-readobj WasmDumper.cpp Win64EHDumper.cpp ) + +add_llvm_tool_symlink(llvm-readelf llvm-readobj) diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp index 9fb3267e2f9d..74c44116b127 100644 --- a/tools/llvm-readobj/COFFDumper.cpp +++ b/tools/llvm-readobj/COFFDumper.cpp @@ -1215,8 +1215,7 @@ void COFFDumper::mergeCodeViewTypes(TypeTableBuilder 
&CVIDs, error(object_error::parse_failed); } SmallVector SourceToDest; - if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, nullptr, - Types)) + if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types)) return error(std::move(EC)); } } diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index a1db96cba081..5698420bbcc2 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -1532,6 +1532,7 @@ static const char *getTypeString(unsigned Arch, uint64_t Type) { LLVM_READOBJ_TYPE_CASE(TLSDESC_PLT); LLVM_READOBJ_TYPE_CASE(TLSDESC_GOT); LLVM_READOBJ_TYPE_CASE(AUXILIARY); + LLVM_READOBJ_TYPE_CASE(FILTER); default: return "unknown"; } } @@ -1624,6 +1625,10 @@ StringRef ELFDumper::getDynamicString(uint64_t Value) const { return StringRef(DynamicStringTable.data() + Value); } +static void printLibrary(raw_ostream &OS, const Twine &Tag, const Twine &Name) { + OS << Tag << ": [" << Name << "]"; +} + template void ELFDumper::printValue(uint64_t Type, uint64_t Value) { raw_ostream &OS = W.getOStream(); @@ -1687,13 +1692,16 @@ void ELFDumper::printValue(uint64_t Type, uint64_t Value) { OS << Value << " (bytes)"; break; case DT_NEEDED: - OS << "SharedLibrary (" << getDynamicString(Value) << ")"; + printLibrary(OS, "Shared library", getDynamicString(Value)); break; case DT_SONAME: - OS << "LibrarySoname (" << getDynamicString(Value) << ")"; + printLibrary(OS, "Library soname", getDynamicString(Value)); break; case DT_AUXILIARY: - OS << "Auxiliary library: [" << getDynamicString(Value) << "]"; + printLibrary(OS, "Auxiliary library", getDynamicString(Value)); + break; + case DT_FILTER: + printLibrary(OS, "Filter library", getDynamicString(Value)); break; case DT_RPATH: case DT_RUNPATH: diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 51991a3f067b..7bfb18fab12b 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ 
-34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Signals.h" @@ -50,6 +51,13 @@ namespace opts { cl::desc(""), cl::ZeroOrMore); + // -wide, -W + cl::opt WideOutput("wide", + cl::desc("Ignored for compatibility with GNU readelf")); + cl::alias WideOutputShort("W", + cl::desc("Alias for --wide"), + cl::aliasopt(WideOutput)); + // -file-headers, -h cl::opt FileHeaders("file-headers", cl::desc("Display file headers ")); @@ -57,12 +65,16 @@ namespace opts { cl::desc("Alias for --file-headers"), cl::aliasopt(FileHeaders)); - // -sections, -s + // -sections, -s, -S + // Note: In GNU readelf, -s means --symbols! cl::opt Sections("sections", cl::desc("Display all sections.")); cl::alias SectionsShort("s", cl::desc("Alias for --sections"), cl::aliasopt(Sections)); + cl::alias SectionsShortUpper("S", + cl::desc("Alias for --sections"), + cl::aliasopt(Sections)); // -section-relocations, -sr cl::opt SectionRelocations("section-relocations", @@ -533,13 +545,19 @@ static void dumpInput(StringRef File) { } int main(int argc, const char *argv[]) { - sys::PrintStackTraceOnErrorSignal(argv[0]); + StringRef ToolName = argv[0]; + sys::PrintStackTraceOnErrorSignal(ToolName); PrettyStackTraceProgram X(argc, argv); llvm_shutdown_obj Y; // Register the target printer for --version. cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + opts::WideOutput.setHiddenFlag(cl::Hidden); + + if (sys::path::stem(ToolName).find("readelf") != StringRef::npos) + opts::Output = opts::GNU; + cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n"); // Default to stdin if no filename is specified. 
diff --git a/tools/opt-viewer/opt-diff.py b/tools/opt-viewer/opt-diff.py index 9e921f8488d3..f39432c8bc9d 100755 --- a/tools/opt-viewer/opt-diff.py +++ b/tools/opt-viewer/opt-diff.py @@ -20,24 +20,17 @@ import optrecord import argparse from collections import defaultdict from multiprocessing import cpu_count, Pool -import os, os.path -import fnmatch - -def find_files(dir_or_file): - if os.path.isfile(dir_or_file): - return [dir_or_file] - - all = [] - for dir, subdirs, files in os.walk(dir_or_file): - for file in files: - if fnmatch.fnmatch(file, "*.opt.yaml"): - all.append( os.path.join(dir, file)) - return all if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_dir_or_file_1') - parser.add_argument('yaml_dir_or_file_2') + parser.add_argument( + 'yaml_dir_or_file_1', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the old version for the comparison') + parser.add_argument( + 'yaml_dir_or_file_2', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the new version for the comparison') parser.add_argument( '--jobs', '-j', @@ -53,8 +46,8 @@ if __name__ == '__main__': parser.add_argument('--output', '-o', default='diff.opt.yaml') args = parser.parse_args() - files1 = find_files(args.yaml_dir_or_file_1) - files2 = find_files(args.yaml_dir_or_file_2) + files1 = optrecord.find_opt_files([args.yaml_dir_or_file_1]) + files2 = optrecord.find_opt_files([args.yaml_dir_or_file_2]) print_progress = not args.no_progress_indicator all_remarks1, _, _ = optrecord.gather_results(files1, args.jobs, print_progress) diff --git a/tools/opt-viewer/opt-stats.py b/tools/opt-viewer/opt-stats.py index a7e598fdfd02..205b08ba8a74 100755 --- a/tools/opt-viewer/opt-stats.py +++ b/tools/opt-viewer/opt-stats.py @@ -15,7 +15,11 @@ from multiprocessing import cpu_count, Pool if __name__ == '__main__': parser = 
argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--jobs', '-j', @@ -31,8 +35,14 @@ if __name__ == '__main__': args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, _ = optrecord.gather_results( - args.yaml_files, args.jobs, print_progress) + files, args.jobs, print_progress) if print_progress: print('\n') diff --git a/tools/opt-viewer/opt-viewer.py b/tools/opt-viewer/opt-viewer.py index e6dd6a0286fe..69bcaedb7669 100755 --- a/tools/opt-viewer/opt-viewer.py +++ b/tools/opt-viewer/opt-viewer.py @@ -219,7 +219,11 @@ def generate_report(all_remarks, if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--output-dir', '-o', @@ -248,8 +252,14 @@ if __name__ == '__main__': args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, should_display_hotness = \ - optrecord.gather_results(args.yaml_files, args.jobs, print_progress) + optrecord.gather_results(files, args.jobs, print_progress) map_remarks(all_remarks) diff --git a/tools/opt-viewer/optpmap.py b/tools/opt-viewer/optpmap.py index 01e848e03976..16cb22e21491 100644 --- a/tools/opt-viewer/optpmap.py +++ b/tools/opt-viewer/optpmap.py @@ -20,6 +20,7 @@ def _wrapped_func(func_and_args): 
with _current.get_lock(): _current.value += 1 sys.stdout.write('\r\t{} of {}'.format(_current.value, _total.value)) + sys.stdout.flush() return func(argument) diff --git a/tools/opt-viewer/optrecord.py b/tools/opt-viewer/optrecord.py index 61ed9626cffa..4599e12d7e68 100644 --- a/tools/opt-viewer/optrecord.py +++ b/tools/opt-viewer/optrecord.py @@ -12,8 +12,10 @@ except ImportError: import cgi from collections import defaultdict +import fnmatch import functools from multiprocessing import Lock +import os, os.path import subprocess import optpmap @@ -47,7 +49,7 @@ def demangle(name): def html_file_name(filename): - return filename.replace('/', '_') + ".html" + return filename.replace('/', '_').replace('#', '_') + ".html" def make_link(File, Line): @@ -233,3 +235,19 @@ def gather_results(filenames, num_jobs, should_print_progress): all_remarks.update(all_remarks_job) return all_remarks, file_remarks, max_hotness != 0 + + +def find_opt_files(dirs_or_files): + all = [] + for dir_or_file in dirs_or_files: + if os.path.isfile(dir_or_file): + all.append(dir_or_file) + else: + for dir, subdirs, files in os.walk(dir_or_file): + # Exclude mounted directories and symlinks (os.walk default). 
+ subdirs[:] = [d for d in subdirs + if not os.path.ismount(os.path.join(dir, d))] + for file in files: + if fnmatch.fnmatch(file, "*.opt.yaml"): + all.append(os.path.join(dir, file)) + return all diff --git a/unittests/Analysis/CGSCCPassManagerTest.cpp b/unittests/Analysis/CGSCCPassManagerTest.cpp index d46d9535fa4b..e24818265975 100644 --- a/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -9,6 +9,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" @@ -227,6 +228,7 @@ public: "entry:\n" " ret void\n" "}\n")) { + MAM.registerPass([&] { return TargetLibraryAnalysis(); }); MAM.registerPass([&] { return LazyCallGraphAnalysis(); }); MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); diff --git a/unittests/Analysis/LazyCallGraphTest.cpp b/unittests/Analysis/LazyCallGraphTest.cpp index 65730486cd75..9e7e128bcfb3 100644 --- a/unittests/Analysis/LazyCallGraphTest.cpp +++ b/unittests/Analysis/LazyCallGraphTest.cpp @@ -216,10 +216,17 @@ static const char DiamondOfTrianglesRefGraph[] = " ret void\n" "}\n"; +static LazyCallGraph buildCG(Module &M) { + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + LazyCallGraph CG(M, TLI); + return CG; +} + TEST(LazyCallGraphTest, BasicGraphFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // The order of the entry nodes should be stable w.r.t. 
the source order of // the IR, and everything in our module is an entry node, so just directly @@ -407,7 +414,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); LazyCallGraph::Node &A = CG.get(lookupFunction(*M, "a")); LazyCallGraph::Node &B = CG.get(lookupFunction(*M, "b")); @@ -445,7 +452,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { TEST(LazyCallGraphTest, InnerSCCFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Now mutate the graph to connect every node into a single RefSCC to ensure // that our inner SCC formation handles the rest. @@ -542,7 +549,7 @@ TEST(LazyCallGraphTest, MultiArmSCC) { " call void @f1()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -593,7 +600,7 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -739,7 +746,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -831,7 +838,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) { // references rather than calls. std::unique_ptr M = parseAssembly(Context, DiamondOfTrianglesRefGraph); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -938,7 +945,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeCallCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. 
CG.buildRefSCCs(); @@ -1015,7 +1022,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1077,7 +1084,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1221,7 +1228,7 @@ TEST(LazyCallGraphTest, InternalEdgeMutation) { " call void @a()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1315,7 +1322,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1390,7 +1397,7 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1467,7 +1474,7 @@ TEST(LazyCallGraphTest, InternalCallEdgeToRef) { " call void @c()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1560,7 +1567,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. 
CG.buildRefSCCs(); @@ -1672,7 +1679,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1802,7 +1809,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1885,7 +1892,7 @@ TEST(LazyCallGraphTest, HandleBlockAddress) { " store i8* blockaddress(@f, %bb), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); CG.buildRefSCCs(); auto I = CG.postorder_ref_scc_begin(); @@ -1933,7 +1940,7 @@ TEST(LazyCallGraphTest, ReplaceNodeFunction) { " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -2011,7 +2018,7 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpurriousRef) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Insert spurious ref edges. 
LazyCallGraph::Node &AN = CG.get(lookupFunction(*M, "a")); diff --git a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp index 92134513b75b..04b7bb0ba936 100644 --- a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp +++ b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp @@ -13,7 +13,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" #include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" @@ -402,4 +401,4 @@ TEST_F(RandomAccessVisitorTest, CrossChunkName) { StringRef Name = Types.getTypeName(IndexOne); EXPECT_EQ("const FooClass", Name); -} \ No newline at end of file +} diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt index 989cb396f674..583b065f464c 100644 --- a/unittests/DebugInfo/PDB/CMakeLists.txt +++ b/unittests/DebugInfo/PDB/CMakeLists.txt @@ -10,11 +10,10 @@ set(DebugInfoPDBSources StringTableBuilderTest.cpp MSFBuilderTest.cpp PDBApiTest.cpp - TypeServerHandlerTest.cpp ) add_llvm_unittest(DebugInfoPDBTests ${DebugInfoPDBSources} ) -target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) \ No newline at end of file +target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) diff --git a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp b/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp deleted file mode 100644 index d09b9130ee27..000000000000 --- a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp +++ /dev/null @@ -1,183 +0,0 @@ -//===- llvm/unittest/DebugInfo/PDB/TypeServerHandlerTest.cpp --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Error.h" -#include "llvm/Testing/Support/Error.h" - -#include "gtest/gtest.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -namespace { - -constexpr uint8_t Guid[] = {0x2a, 0x2c, 0x1c, 0x2a, 0xcb, 0x9e, 0x48, 0x18, - 0x82, 0x82, 0x7a, 0x87, 0xc3, 0xfe, 0x16, 0xe8}; -StringRef GuidStr(reinterpret_cast(Guid), - llvm::array_lengthof(Guid)); - -constexpr const char *Name = "Test Name"; -constexpr int Age = 1; - -class MockTypeServerHandler : public TypeServerHandler { -public: - explicit MockTypeServerHandler(bool HandleAlways) - : HandleAlways(HandleAlways) {} - - Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) override { - if (TS.Age != Age || TS.Guid != GuidStr || TS.Name != Name) - return make_error(cv_error_code::corrupt_record, - "Invalid TypeServer record!"); - - if (Handled && !HandleAlways) - return false; - - Handled = true; - return true; - } - - bool Handled = false; - bool HandleAlways; -}; - -class MockTypeVisitorCallbacks : public TypeVisitorCallbacks { -public: - enum class State { - Ready, - VisitTypeBegin, - VisitKnownRecord, - VisitTypeEnd, - }; - Error visitTypeBegin(CVType &CVT) override { - if (S != State::Ready) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeBegin; 
- return Error::success(); - } - - Error visitKnownRecord(CVType &CVT, TypeServer2Record &TS) override { - if (S != State::VisitTypeBegin) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitKnownRecord; - return Error::success(); - } - - Error visitTypeEnd(CVType &CVT) override { - if (S != State::VisitKnownRecord) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeEnd; - return Error::success(); - } - - State S = State::Ready; -}; - -class TypeServerHandlerTest : public testing::Test { -public: - void SetUp() override { - TypeServer2Record R(TypeRecordKind::TypeServer2); - R.Age = Age; - R.Guid = GuidStr; - R.Name = Name; - - TypeTableBuilder Builder(Allocator); - Builder.writeKnownType(R); - TypeServerRecord.RecordData = Builder.records().front(); - TypeServerRecord.Type = TypeLeafKind::LF_TYPESERVER2; - } - -protected: - BumpPtrAllocator Allocator; - CVType TypeServerRecord; -}; - -// Test that when no type server handler is registered, it gets handled by the -// normal -// visitor callbacks. -TEST_F(TypeServerHandlerTest, VisitRecordNoTypeServer) { - MockTypeVisitorCallbacks C2; - MockTypeVisitorCallbacks C1; - TypeVisitorCallbackPipeline Pipeline; - - Pipeline.addCallbackToPipeline(C1); - Pipeline.addCallbackToPipeline(C2); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, Pipeline), - Succeeded()); - - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C2.S); -} - -// Test that when a TypeServerHandler is registered, it gets consumed by the -// handler if and only if the handler returns true. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerOnce) { - MockTypeServerHandler Handler(false); - - MockTypeVisitorCallbacks C1; - - // Our mock server returns true the first time. 
- EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - // And false the second time. - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); -} - -// Test that when a type server handler is registered, if the handler keeps -// returning true, it will keep getting consumed by the handler and not go -// to the default processor. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerAlways) { - MockTypeServerHandler Handler(true); - - MockTypeVisitorCallbacks C1; - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); -} - -} // end anonymous namespace diff --git a/unittests/IR/CFGBuilder.cpp b/unittests/IR/CFGBuilder.cpp new file mode 100644 index 000000000000..50494ab5c7ca --- /dev/null +++ b/unittests/IR/CFGBuilder.cpp @@ -0,0 +1,269 @@ +//===- llvm/Testing/Support/CFGBuilder.cpp --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "CFGBuilder.h" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" + +#define DEBUG_TYPE "cfg-builder" + +using namespace llvm; + +CFGHolder::CFGHolder(StringRef ModuleName, StringRef FunctionName) + : Context(llvm::make_unique()), + M(llvm::make_unique(ModuleName, *Context)) { + FunctionType *FTy = TypeBuilder::get(*Context); + F = cast(M->getOrInsertFunction(FunctionName, FTy)); +} +CFGHolder::~CFGHolder() = default; + +CFGBuilder::CFGBuilder(Function *F, const std::vector &InitialArcs, + std::vector Updates) + : F(F), Updates(std::move(Updates)) { + assert(F); + buildCFG(InitialArcs); +} + +static void ConnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Creating BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + auto *IntTy = IntegerType::get(From->getContext(), 32); + + if (isa(From->getTerminator())) + From->getTerminator()->eraseFromParent(); + if (!From->getTerminator()) { + IRBuilder<> IRB(From); + IRB.CreateSwitch(ConstantInt::get(IntTy, 0), To); + return; + } + + SwitchInst *SI = cast(From->getTerminator()); + const auto Last = SI->getNumCases(); + + auto *IntVal = ConstantInt::get(IntTy, Last); + SI->addCase(IntVal, To); +} + +static void DisconnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Deleting BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + SwitchInst *SI = cast(From->getTerminator()); + + if (SI->getNumCases() == 0) { + SI->eraseFromParent(); + IRBuilder<> IRB(From); + IRB.CreateUnreachable(); + return; + } + + if (SI->getDefaultDest() == To) { + auto FirstC = SI->case_begin(); + SI->setDefaultDest(FirstC->getCaseSuccessor()); + SI->removeCase(FirstC); + return; + } + + for (auto CIt = 
SI->case_begin(); CIt != SI->case_end(); ++CIt) + if (CIt->getCaseSuccessor() == To) { + SI->removeCase(CIt); + return; + } +} + +BasicBlock *CFGBuilder::getOrAddBlock(StringRef BlockName) { + auto BIt = NameToBlock.find(BlockName); + if (BIt != NameToBlock.end()) + return BIt->second; + + auto *BB = BasicBlock::Create(F->getParent()->getContext(), BlockName, F); + IRBuilder<> IRB(BB); + IRB.CreateUnreachable(); + NameToBlock[BlockName] = BB; + return BB; +} + +bool CFGBuilder::connect(const Arc &A) { + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + if (Arcs.count(A) != 0) + return false; + + Arcs.insert(A); + ConnectBlocks(From, To); + return true; +} + +bool CFGBuilder::disconnect(const Arc &A) { + assert(NameToBlock.count(A.From) != 0 && "No block to disconnect (From)"); + assert(NameToBlock.count(A.To) != 0 && "No block to disconnect (To)"); + if (Arcs.count(A) == 0) + return false; + + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + Arcs.erase(A); + DisconnectBlocks(From, To); + return true; +} + +void CFGBuilder::buildCFG(const std::vector &NewArcs) { + for (const auto &A : NewArcs) { + const bool Connected = connect(A); + (void)Connected; + assert(Connected); + } +} + +Optional CFGBuilder::getNextUpdate() const { + if (UpdateIdx == Updates.size()) + return None; + return Updates[UpdateIdx]; +} + +Optional CFGBuilder::applyUpdate() { + if (UpdateIdx == Updates.size()) + return None; + Update NextUpdate = Updates[UpdateIdx++]; + if (NextUpdate.Action == ActionKind::Insert) + connect(NextUpdate.Edge); + else + disconnect(NextUpdate.Edge); + + return NextUpdate; +} + +void CFGBuilder::dump(raw_ostream &OS) const { + OS << "Arcs:\n"; + size_t i = 0; + for (const auto &A : Arcs) + OS << " " << i++ << ":\t" << A.From << " -> " << A.To << "\n"; + + OS << "Updates:\n"; + i = 0; + for (const auto &U : Updates) { + OS << (i + 1 == UpdateIdx ? 
"->" : " ") << i + << ((U.Action == ActionKind::Insert) ? "\tIns " : "\tDel ") + << U.Edge.From << " -> " << U.Edge.To << "\n"; + ++i; + } +} + +//---- CFGBuilder tests ---------------------------------------------------===// + +TEST(CFGBuilder, Construction) { + CFGHolder Holder; + std::vector Arcs = {{"entry", "a"}, {"a", "b"}, {"a", "c"}, + {"c", "d"}, {"d", "b"}, {"d", "e"}, + {"d", "f"}, {"e", "f"}}; + CFGBuilder B(Holder.F, Arcs, {}); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. +} + +TEST(CFGBuilder, Insertions) { + CFGHolder Holder; + const auto Insert = CFGBuilder::ActionKind::Insert; + std::vector Updates = { + {Insert, {"entry", "a"}}, {Insert, {"a", "b"}}, {Insert, {"a", "c"}}, + {Insert, {"c", "d"}}, {Insert, {"d", "b"}}, {Insert, {"d", "e"}}, + {Insert, {"d", "f"}}, {Insert, {"e", "f"}}}; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, {}, Updates); + + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + 
EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. +} + +TEST(CFGBuilder, Deletions) { + CFGHolder Holder; + std::vector Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto UpdateC = B.applyUpdate(); + + EXPECT_TRUE(UpdateC); + EXPECT_EQ(UpdateC->Action, CFGBuilder::ActionKind::Delete); + EXPECT_EQ(UpdateC->Edge.From, "c"); + EXPECT_EQ(UpdateC->Edge.To, "d"); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + + size_t i = 1; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); +} + +TEST(CFGBuilder, Rebuild) { + CFGHolder Holder; + std::vector Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Insert = CFGBuilder::ActionKind::Insert; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + {Insert, {"c", "d"}}, {Insert, {"a", "c"}}, {Insert, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); +} diff --git 
a/unittests/IR/CFGBuilder.h b/unittests/IR/CFGBuilder.h new file mode 100644 index 000000000000..d9d9c378e110 --- /dev/null +++ b/unittests/IR/CFGBuilder.h @@ -0,0 +1,94 @@ +//===- CFGBuilder.h - CFG building and updating utility ----------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// CFGBuilders provides utilities fo building and updating CFG for testing +/// purposes. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_CFG_BUILDER_H +#define LLVM_UNITTESTS_CFG_BUILDER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" + +#include +#include +#include +#include + +namespace llvm { + +class LLVMContext; +class Module; +class Function; +class BasicBlock; +class raw_ostream; + +struct CFGHolder { + std::unique_ptr Context; + std::unique_ptr M; + Function *F; + + CFGHolder(StringRef ModuleName = "m", StringRef FunctionName = "foo"); + ~CFGHolder(); // Defined in the .cpp file so we can use forward declarations. +}; + +/// \brief +/// CFGBuilder builds IR with specific CFG, based on the supplied list of arcs. +/// It's able to apply the provided updates and automatically modify the IR. +/// +/// Internally it makes every basic block end with either SwitchInst or with +/// UnreachableInst. When all arc to a BB are deleted, the BB remains in the +/// function and doesn't get deleted. 
+/// +class CFGBuilder { +public: + struct Arc { + StringRef From; + StringRef To; + + friend bool operator<(const Arc &LHS, const Arc &RHS) { + return std::tie(LHS.From, LHS.To) < + std::tie(RHS.From, RHS.To); + } + }; + + enum class ActionKind { Insert, Delete }; + struct Update { + ActionKind Action; + Arc Edge; + }; + + CFGBuilder(Function *F, const std::vector &InitialArcs, + std::vector Updates); + + BasicBlock *getOrAddBlock(StringRef BlockName); + Optional getNextUpdate() const; + Optional applyUpdate(); + void dump(raw_ostream &OS = dbgs()) const; + +private: + void buildCFG(const std::vector &Arcs); + bool connect(const Arc &A); + bool disconnect(const Arc &A); + + Function *F; + unsigned UpdateIdx = 0; + StringMap NameToBlock; + std::set Arcs; + std::vector Updates; +}; + +} // namespace llvm + +#endif diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index d76ebfa64d88..92c9f8d3788d 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -10,6 +10,7 @@ set(IRSources AsmWriterTest.cpp AttributesTest.cpp BasicBlockTest.cpp + CFGBuilder.cpp ConstantRangeTest.cpp ConstantsTest.cpp DebugInfoTest.cpp diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index fa3dad8a2ab1..df1e2993dc85 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include #include "llvm/Analysis/PostDominators.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Constants.h" @@ -15,22 +16,24 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" +#include "CFGBuilder.h" #include "gtest/gtest.h" using namespace llvm; +struct PostDomTree : PostDomTreeBase { + PostDomTree(Function &F) { recalculate(F); } +}; + /// Build the dominator tree for the function and run the Test. 
-static void -runWithDomTree(Module &M, StringRef FuncName, - function_ref *PDT)> - Test) { +static void runWithDomTree( + Module &M, StringRef FuncName, + function_ref Test) { auto *F = M.getFunction(FuncName); ASSERT_NE(F, nullptr) << "Could not find " << FuncName; // Compute the dominator tree for the function. DominatorTree DT(*F); - DominatorTreeBase PDT(/*isPostDom*/ true); - PDT.recalculate(*F); + PostDomTree PDT(*F); Test(*F, &DT, &PDT); } @@ -72,8 +75,7 @@ TEST(DominatorTree, Unreachable) { std::unique_ptr M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -293,8 +295,7 @@ TEST(DominatorTree, NonUniqueEdges) { std::unique_ptr M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -324,3 +325,280 @@ TEST(DominatorTree, NonUniqueEdges) { EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_b, BB2)); }); } + +namespace { +const auto Insert = CFGBuilder::ActionKind::Insert; +const auto Delete = CFGBuilder::ActionKind::Delete; + +bool CompUpdates(const CFGBuilder::Update &A, const CFGBuilder::Update &B) { + return std::tie(A.Action, A.Edge.From, A.Edge.To) < + std::tie(B.Action, B.Edge.From, B.Edge.To); +} +} // namespace + +TEST(DominatorTree, InsertReachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = {{Insert, {"12", "10"}}, + {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + 
EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertReachable2) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "5"}, {"2", "8"}, {"8", "11"}, {"11", "12"}, {"12", "10"}, + {"10", "9"}, {"9", "10"}}; + + std::vector Updates = {{Insert, {"10", "7"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate = B.applyUpdate(); + EXPECT_TRUE(LastUpdate); + + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); +} + +TEST(DominatorTree, InsertUnreachable) { + CFGHolder Holder; + std::vector Arcs = {{"1", "2"}, {"2", "3"}, {"3", "4"}, + {"5", "6"}, {"5", "7"}, {"3", "8"}, + {"9", "10"}, {"11", "12"}}; + + std::vector Updates = {{Insert, {"4", "5"}}, + {Insert, {"8", "9"}}, + {Insert, {"10", "12"}}, + {Insert, {"10", "11"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); 
+ PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertMixed) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector Updates = { + {Insert, {"4", "5"}}, {Insert, {"2", "5"}}, {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}, {Insert, {"12", "10"}}, {Insert, {"7", "8"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertPermut) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector Updates = {{Insert, {"4", "5"}}, + {Insert, {"2", "5"}}, + {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}}; + + while (std::next_permutation(Updates.begin(), Updates.end(), CompUpdates)) { + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } + } +} + +TEST(DominatorTree, DeleteReachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, 
{"2", "3"}, {"2", "4"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, + {"5", "7"}, {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector Updates = { + {Delete, {"2", "4"}}, {Delete, {"7", "8"}}, {Delete, {"10", "2"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, DeleteUnreachable) { + CFGHolder Holder; + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector Updates = { + {Delete, {"8", "9"}}, {Delete, {"7", "8"}}, {Delete, {"3", "4"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDelete) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", 
"4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDeleteExhaustive) { + std::vector Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + std::mt19937 Generator(0); + for (unsigned i = 0; i < 16; ++i) { + std::shuffle(Updates.begin(), Updates.end(), Generator); + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } + } +} diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 186330f10573..d361107cc0d2 100644 --- 
a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -463,13 +463,14 @@ TEST_F(IRBuilderTest, DebugLoc) { TEST_F(IRBuilderTest, DIImportedEntity) { IRBuilder<> Builder(BB); DIBuilder DIB(*M); + auto F = DIB.createFile("F.CBL", "/"); auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, - DIB.createFile("F.CBL", "/"), "llvm-cobol74", + F, "llvm-cobol74", true, "", 0); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); DIB.finalize(); EXPECT_TRUE(verifyModule(*M)); EXPECT_TRUE(CU->getImportedEntities().size() == 2); diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index cb38b30f43e6..e47afca532d2 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -2116,29 +2116,35 @@ TEST_F(DIImportedEntityTest, get) { unsigned Tag = dwarf::DW_TAG_imported_module; DIScope *Scope = getSubprogram(); DINode *Entity = getCompositeType(); + DIFile *File = getFile(); unsigned Line = 5; StringRef Name = "name"; - auto *N = DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name); + auto *N = + DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name); EXPECT_EQ(Tag, N->getTag()); EXPECT_EQ(Scope, N->getScope()); EXPECT_EQ(Entity, N->getEntity()); + EXPECT_EQ(File, N->getFile()); EXPECT_EQ(Line, N->getLine()); EXPECT_EQ(Name, N->getName()); - EXPECT_EQ(N, DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name)); + EXPECT_EQ( + N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, dwarf::DW_TAG_imported_declaration, - 
Scope, Entity, Line, Name)); + Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, getSubprogram(), Entity, - Line, Name)); + File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, getCompositeType(), - Line, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line + 1, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line, "other")); + File, Line, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, nullptr, Line, + Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, + Line + 1, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, + "other")); TempDIImportedEntity Temp = N->clone(); EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp))); diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp index b252641f1a13..b9b725f934b3 100644 --- a/unittests/Support/TargetParserTest.cpp +++ b/unittests/Support/TargetParserTest.cpp @@ -737,7 +737,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { unsigned Extensions = AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_SIMD | AArch64::AEK_FP16 | AArch64::AEK_PROFILE | - AArch64::AEK_RAS; + AArch64::AEK_RAS | AArch64::AEK_SVE; for (unsigned i = 0; i <= Extensions; i++) EXPECT_TRUE(i == 0 ? 
!AArch64::getExtensionFeatures(i, Features) @@ -762,7 +762,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"simd", "nosimd", "+neon", "-neon"}, {"fp16", "nofp16", "+fullfp16", "-fullfp16"}, {"profile", "noprofile", "+spe", "-spe"}, - {"ras", "noras", "+ras", "-ras"}}; + {"ras", "noras", "+ras", "-ras"}, + {"sve", "nosve", "+sve", "-sve"}}; for (unsigned i = 0; i < array_lengthof(ArchExt); i++) { EXPECT_EQ(StringRef(ArchExt[i][2]), diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 5cf0e9d0f5b3..120773a0c8dd 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -232,6 +232,22 @@ TEST(YAMLIO, TestSequenceMapWriteAndRead) { } } +// +// Test YAML filename handling. +// +static void testErrorFilename(const llvm::SMDiagnostic &Error, void *) { + EXPECT_EQ(Error.getFilename(), "foo.yaml"); +} + +TEST(YAMLIO, TestGivenFilename) { + auto Buffer = llvm::MemoryBuffer::getMemBuffer("{ x: 42 }", "foo.yaml"); + Input yin(*Buffer, nullptr, testErrorFilename); + FooBar Value; + yin >> Value; + + EXPECT_TRUE(!!yin.error()); +} + //===----------------------------------------------------------------------===// // Test built-in types diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp index a7a5ce8dd6d4..a75f446e4ba1 100644 --- a/unittests/Support/raw_ostream_test.cpp +++ b/unittests/Support/raw_ostream_test.cpp @@ -151,6 +151,11 @@ TEST(raw_ostreamTest, Justify) { EXPECT_EQ(" xyz", printToString(right_justify("xyz", 6), 6)); EXPECT_EQ("abc", printToString(right_justify("abc", 3), 3)); EXPECT_EQ("big", printToString(right_justify("big", 1), 3)); + EXPECT_EQ(" on ", printToString(center_justify("on", 9), 9)); + EXPECT_EQ(" off ", printToString(center_justify("off", 10), 10)); + EXPECT_EQ("single ", printToString(center_justify("single", 7), 7)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 4)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 
1)); } TEST(raw_ostreamTest, FormatHex) { diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index d4a21a986c58..6399fb5ec1dd 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -1268,12 +1268,12 @@ void CodeGenRegBank::computeSubRegLaneMasks() { CoveringLanes = LaneBitmask::getAll(); for (auto &Idx : SubRegIndices) { if (Idx.getComposites().empty()) { - if (Bit > 32) { + if (Bit > LaneBitmask::BitWidth) { PrintFatalError( Twine("Ran out of lanemask bits to represent subregister ") + Idx.getName()); } - Idx.LaneMask = LaneBitmask(1 << Bit); + Idx.LaneMask = LaneBitmask::getLane(Bit); ++Bit; } else { Idx.LaneMask = LaneBitmask::getNone(); @@ -1298,9 +1298,9 @@ void CodeGenRegBank::computeSubRegLaneMasks() { static_assert(sizeof(Idx.LaneMask.getAsInteger()) == 4, "Change Log2_32 to a proper one"); unsigned DstBit = Log2_32(Idx.LaneMask.getAsInteger()); - assert(Idx.LaneMask == LaneBitmask(1 << DstBit) && + assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) && "Must be a leaf subregister"); - MaskRolPair MaskRol = { LaneBitmask(1), (uint8_t)DstBit }; + MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit }; LaneTransforms.push_back(MaskRol); } else { // Go through all leaf subregisters and find the ones that compose with @@ -1314,7 +1314,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { continue; // Replicate the behaviour from the lane mask generation loop above. unsigned SrcBit = NextBit; - LaneBitmask SrcMask = LaneBitmask(1 << SrcBit); + LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit); if (NextBit < LaneBitmask::BitWidth-1) ++NextBit; assert(Idx2.LaneMask == SrcMask); @@ -1386,7 +1386,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // For classes without any subregisters set LaneMask to 1 instead of 0. // This makes it easier for client code to handle classes uniformly. 
if (LaneMask.none()) - LaneMask = LaneBitmask(1); + LaneMask = LaneBitmask::getLane(0); RegClass.LaneMask = LaneMask; } diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index 2b680846e176..2ef0a8f77ec9 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -25,7 +25,8 @@ class LitConfig(object): params, config_prefix = None, maxIndividualTestTime = 0, maxFailures = None, - parallelism_groups = []): + parallelism_groups = [], + echo_all_commands = False): # The name of the test runner. self.progname = progname # The items to add to the PATH environment variable. @@ -64,6 +65,7 @@ class LitConfig(object): self.maxIndividualTestTime = maxIndividualTestTime self.maxFailures = maxFailures self.parallelism_groups = parallelism_groups + self.echo_all_commands = echo_all_commands @property def maxIndividualTestTime(self): diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 8260d3813345..46bcac4b306e 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -715,6 +715,8 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): else: if test.config.pipefail: f.write('set -o pipefail;') + if litConfig.echo_all_commands: + f.write('set -x;') f.write('{ ' + '; } &&\n{ '.join(commands) + '; }') f.write('\n') f.close() diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py index 530f962d336d..f0162464ce33 100755 --- a/utils/lit/lit/main.py +++ b/utils/lit/lit/main.py @@ -199,6 +199,12 @@ def main_with_tmp(builtinParameters): format_group.add_argument("-v", "--verbose", dest="showOutput", help="Show test output for failures", action="store_true", default=False) + format_group.add_argument("-vv", "--echo-all-commands", + dest="echoAllCommands", + action="store_true", default=False, + help="Echo all commands as they are executed to stdout.\ + In case of failure, last command shown will be the\ + failing one.") format_group.add_argument("-a", "--show-all", dest="showAllOutput", 
help="Display all commandlines and output", action="store_true", default=False) @@ -303,6 +309,9 @@ def main_with_tmp(builtinParameters): if opts.maxFailures == 0: parser.error("Setting --max-failures to 0 does not have any effect.") + if opts.echoAllCommands: + opts.showOutput = True + inputs = args # Create the user defined parameters. @@ -338,7 +347,8 @@ def main_with_tmp(builtinParameters): config_prefix = opts.configPrefix, maxIndividualTestTime = maxIndividualTestTime, maxFailures = opts.maxFailures, - parallelism_groups = {}) + parallelism_groups = {}, + echo_all_commands = opts.echoAllCommands) # Perform test discovery. run = lit.run.Run(litConfig, diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim index e795c7f62133..e110da4329b5 100644 --- a/utils/vim/syntax/llvm.vim +++ b/utils/vim/syntax/llvm.vim @@ -1,7 +1,7 @@ " Vim syntax file " Language: llvm " Maintainer: The LLVM team, http://llvm.org/ -" Version: $Revision: 307419 $ +" Version: $Revision: 308208 $ if version < 600 syntax clear @@ -161,7 +161,7 @@ syn keyword llvmKeyword \ within \ writeonly \ x86_64_sysvcc - \ x86_64_win64cc + \ win64cc \ x86_fastcallcc \ x86_stdcallcc \ x86_thiscallcc -- cgit v1.3