From 30815c536baacc07e925f0aef23a5395883173dc Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Thu, 20 Oct 2011 21:10:27 +0000
Subject: Vendor import of llvm release_30 branch r142614:
 http://llvm.org/svn/llvm-project/llvm/branches/release_30@142614

---
 lib/Target/ARM/ARM.h                               |   21 +-
 lib/Target/ARM/ARM.td                              |   23 +-
 lib/Target/ARM/ARMAddressingModes.h                |  595 ---
 lib/Target/ARM/ARMAsmBackend.cpp                   |  516 ---
 lib/Target/ARM/ARMAsmPrinter.cpp                   |  194 +-
 lib/Target/ARM/ARMBaseInfo.h                       |  294 --
 lib/Target/ARM/ARMBaseInstrInfo.cpp                |  522 ++-
 lib/Target/ARM/ARMBaseInstrInfo.h                  |  161 +-
 lib/Target/ARM/ARMBaseRegisterInfo.cpp             |  287 +-
 lib/Target/ARM/ARMBaseRegisterInfo.h               |   19 +-
 lib/Target/ARM/ARMCodeEmitter.cpp                  |   35 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp           |   23 +-
 lib/Target/ARM/ARMConstantPoolValue.cpp            |  317 +-
 lib/Target/ARM/ARMConstantPoolValue.h              |  189 +-
 lib/Target/ARM/ARMExpandPseudoInsts.cpp            |   61 +-
 lib/Target/ARM/ARMFastISel.cpp                     |   93 +-
 lib/Target/ARM/ARMFixupKinds.h                     |   97 -
 lib/Target/ARM/ARMFrameLowering.cpp                |   31 +-
 lib/Target/ARM/ARMGlobalMerge.cpp                  |   10 +-
 lib/Target/ARM/ARMISelDAGToDAG.cpp                 |  336 +-
 lib/Target/ARM/ARMISelLowering.cpp                 | 1295 +++++-
 lib/Target/ARM/ARMISelLowering.h                   |   47 +-
 lib/Target/ARM/ARMInstrFormats.td                  |  341 +-
 lib/Target/ARM/ARMInstrInfo.cpp                    |   26 +-
 lib/Target/ARM/ARMInstrInfo.td                     | 2752 ++++++++----
 lib/Target/ARM/ARMInstrNEON.td                     |  153 +-
 lib/Target/ARM/ARMInstrThumb.td                    |  489 ++-
 lib/Target/ARM/ARMInstrThumb2.td                   | 1957 ++++++---
 lib/Target/ARM/ARMInstrVFP.td                      |  113 +-
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp           |   49 +-
 lib/Target/ARM/ARMMCCodeEmitter.cpp                | 1314 ------
 lib/Target/ARM/ARMMCExpr.cpp                       |   73 -
 lib/Target/ARM/ARMMCExpr.h                         |   76 -
 lib/Target/ARM/ARMMCInstLower.cpp                  |    2 +-
 lib/Target/ARM/ARMMachObjectWriter.cpp             |  389 --
 lib/Target/ARM/ARMRegisterInfo.td                  |   24 +-
 lib/Target/ARM/ARMSelectionDAGInfo.cpp             |   15 +-
 lib/Target/ARM/ARMSelectionDAGInfo.h               |   17 +
 lib/Target/ARM/ARMSubtarget.cpp                    |    5 +
 lib/Target/ARM/ARMSubtarget.h                      |   18 +
 lib/Target/ARM/ARMTargetMachine.cpp                |   91 +-
 lib/Target/ARM/ARMTargetMachine.h                  |   16 +-
 lib/Target/ARM/AsmParser/ARMAsmLexer.cpp           |   37 +-
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp          | 4071 ++++++++++++++----
 lib/Target/ARM/AsmParser/CMakeLists.txt            |    9 +
 lib/Target/ARM/CMakeLists.txt                      |   48 +-
 lib/Target/ARM/Disassembler/ARMDisassembler.cpp    | 4489 +++++++++++++++++---
 lib/Target/ARM/Disassembler/ARMDisassembler.h      |   99 -
 .../ARM/Disassembler/ARMDisassemblerCore.cpp       | 3818 -----------------
 lib/Target/ARM/Disassembler/ARMDisassemblerCore.h  |  336 --
 lib/Target/ARM/Disassembler/CMakeLists.txt         |   11 +-
 .../ARM/Disassembler/ThumbDisassemblerCore.h       | 2459 -----------
 lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp      |  399 +-
 lib/Target/ARM/InstPrinter/ARMInstPrinter.h        |   34 +-
 lib/Target/ARM/InstPrinter/CMakeLists.txt          |    8 +-
 lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h   |  667 +++
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp      |  531 +++
 lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h          |  449 ++
 lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h        |   97 +
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp       |   11 +-
 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp   | 1468 +++++++
 lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp          |   73 +
 lib/Target/ARM/MCTargetDesc/ARMMCExpr.h            |   76 +
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp    |  179 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h      |   21 +-
 .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp       |  388 ++
 lib/Target/ARM/MCTargetDesc/CMakeLists.txt         |   14 +-
 lib/Target/ARM/Makefile                            |    5 +-
 lib/Target/ARM/NEONMoveFix.cpp                     |  149 -
 lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp        |    2 +-
 lib/Target/ARM/TargetInfo/CMakeLists.txt           |    8 +-
 lib/Target/ARM/Thumb1FrameLowering.cpp             |   11 +-
 lib/Target/ARM/Thumb1RegisterInfo.cpp              |   34 +-
 lib/Target/ARM/Thumb2ITBlockPass.cpp               |   21 +
 lib/Target/ARM/Thumb2InstrInfo.cpp                 |   13 +-
 lib/Target/ARM/Thumb2SizeReduction.cpp             |   21 +-
 lib/Target/Alpha/AlphaAsmPrinter.cpp               |    2 +-
 lib/Target/Alpha/AlphaISelDAGToDAG.cpp             |    7 +-
 lib/Target/Alpha/AlphaISelLowering.cpp             |    8 +-
 lib/Target/Alpha/AlphaISelLowering.h               |    2 +-
 lib/Target/Alpha/AlphaInstrInfo.cpp                |    1 -
 lib/Target/Alpha/AlphaInstrInfo.td                 |    2 +
 lib/Target/Alpha/AlphaRegisterInfo.cpp             |   18 +-
 lib/Target/Alpha/AlphaRegisterInfo.h               |    4 -
 lib/Target/Alpha/AlphaSubtarget.cpp                |    1 -
 lib/Target/Alpha/AlphaTargetMachine.cpp            |   12 +-
 lib/Target/Alpha/AlphaTargetMachine.h              |    5 +-
 lib/Target/Alpha/CMakeLists.txt                    |   25 +-
 .../Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp       |   35 +-
 lib/Target/Alpha/MCTargetDesc/CMakeLists.txt       |    7 +
 lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp    |    2 +-
 lib/Target/Alpha/TargetInfo/CMakeLists.txt         |    8 +-
 lib/Target/Blackfin/BlackfinAsmPrinter.cpp         |    2 +-
 lib/Target/Blackfin/BlackfinFrameLowering.h        |    4 +-
 lib/Target/Blackfin/BlackfinISelLowering.cpp       |    4 +-
 lib/Target/Blackfin/BlackfinISelLowering.h         |    2 +-
 lib/Target/Blackfin/BlackfinInstrInfo.cpp          |    2 +-
 lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp      |    8 +-
 lib/Target/Blackfin/BlackfinIntrinsicInfo.h        |    4 +-
 lib/Target/Blackfin/BlackfinRegisterInfo.cpp       |   18 +-
 lib/Target/Blackfin/BlackfinRegisterInfo.h         |    4 -
 lib/Target/Blackfin/BlackfinSubtarget.cpp          |    2 +-
 lib/Target/Blackfin/BlackfinTargetMachine.cpp      |   12 +-
 lib/Target/Blackfin/BlackfinTargetMachine.h        |    5 +-
 lib/Target/Blackfin/CMakeLists.txt                 |   27 +-
 .../Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp |   39 +-
 lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt    |    7 +
 .../Blackfin/TargetInfo/BlackfinTargetInfo.cpp     |    2 +-
 lib/Target/Blackfin/TargetInfo/CMakeLists.txt      |    8 +-
 lib/Target/CBackend/CBackend.cpp                   |  180 +-
 lib/Target/CBackend/CMakeLists.txt                 |   12 +
 lib/Target/CBackend/CTargetMachine.h               |    5 +-
 .../CBackend/TargetInfo/CBackendTargetInfo.cpp     |    4 +-
 lib/Target/CBackend/TargetInfo/CMakeLists.txt      |    5 +
 lib/Target/CMakeLists.txt                          |    8 +-
 lib/Target/CellSPU/CMakeLists.txt                  |   27 +-
 lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt     |    7 +
 .../CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp       |   52 +-
 lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h  |    4 +-
 lib/Target/CellSPU/SPUAsmPrinter.cpp               |    2 +-
 lib/Target/CellSPU/SPUFrameLowering.cpp            |   20 -
 lib/Target/CellSPU/SPUFrameLowering.h              |   14 -
 lib/Target/CellSPU/SPUISelLowering.cpp             |   16 +-
 lib/Target/CellSPU/SPUISelLowering.h               |    6 +-
 lib/Target/CellSPU/SPUInstrInfo.cpp                |    4 +-
 lib/Target/CellSPU/SPUInstrInfo.td                 |   10 +-
 lib/Target/CellSPU/SPURegisterInfo.cpp             |   19 +-
 lib/Target/CellSPU/SPURegisterInfo.h               |    6 -
 lib/Target/CellSPU/SPUSubtarget.cpp                |    2 +-
 lib/Target/CellSPU/SPUTargetMachine.cpp            |   25 +-
 lib/Target/CellSPU/SPUTargetMachine.h              |    5 +-
 lib/Target/CellSPU/TargetInfo/CMakeLists.txt       |    8 +-
 .../CellSPU/TargetInfo/CellSPUTargetInfo.cpp       |    2 +-
 lib/Target/CppBackend/CMakeLists.txt               |    7 +
 lib/Target/CppBackend/CPPBackend.cpp               |  141 +-
 lib/Target/CppBackend/CPPTargetMachine.h           |    5 +-
 lib/Target/CppBackend/TargetInfo/CMakeLists.txt    |    4 +
 .../CppBackend/TargetInfo/CppBackendTargetInfo.cpp |    4 +-
 lib/Target/MBlaze/AsmParser/CMakeLists.txt         |    8 +
 lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp     |   27 +-
 lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp    |   24 +-
 lib/Target/MBlaze/AsmParser/Makefile               |    2 +-
 lib/Target/MBlaze/CMakeLists.txt                   |   36 +-
 lib/Target/MBlaze/Disassembler/CMakeLists.txt      |   10 +-
 .../MBlaze/Disassembler/MBlazeDisassembler.cpp     |   76 +-
 .../MBlaze/Disassembler/MBlazeDisassembler.h       |    9 +-
 lib/Target/MBlaze/InstPrinter/CMakeLists.txt       |    9 +-
 .../MBlaze/InstPrinter/MBlazeInstPrinter.cpp       |    4 +-
 lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h  |    2 +-
 lib/Target/MBlaze/MBlaze.h                         |   12 +-
 lib/Target/MBlaze/MBlazeAsmBackend.cpp             |  162 -
 lib/Target/MBlaze/MBlazeAsmPrinter.cpp             |   21 +-
 lib/Target/MBlaze/MBlazeFrameLowering.cpp          |    2 +-
 lib/Target/MBlaze/MBlazeISelLowering.cpp           |   13 +-
 lib/Target/MBlaze/MBlazeISelLowering.h             |    2 +-
 lib/Target/MBlaze/MBlazeInstrInfo.cpp              |    5 +-
 lib/Target/MBlaze/MBlazeInstrInfo.h                |   56 -
 lib/Target/MBlaze/MBlazeInstrInfo.td               |   21 +-
 lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp          |    8 +-
 lib/Target/MBlaze/MBlazeIntrinsicInfo.h            |    4 +-
 lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp          |  222 -
 lib/Target/MBlaze/MBlazeRegisterInfo.cpp           |  172 +-
 lib/Target/MBlaze/MBlazeRegisterInfo.h             |   12 -
 lib/Target/MBlaze/MBlazeSubtarget.cpp              |    2 +-
 lib/Target/MBlaze/MBlazeTargetMachine.cpp          |   50 +-
 lib/Target/MBlaze/MBlazeTargetMachine.h            |    5 +-
 lib/Target/MBlaze/MBlazeTargetObjectFile.cpp       |    2 +-
 lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt      |   13 +-
 .../MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp       |  159 +
 lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h    |  240 ++
 .../MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp    |  224 +
 .../MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp     |   93 +-
 .../MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h       |   11 +
 lib/Target/MBlaze/TargetInfo/CMakeLists.txt        |    8 +-
 lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp  |    2 +-
 lib/Target/MSP430/CMakeLists.txt                   |   26 +-
 lib/Target/MSP430/InstPrinter/CMakeLists.txt       |    8 +-
 .../MSP430/InstPrinter/MSP430InstPrinter.cpp       |    4 +-
 lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h  |    4 +-
 lib/Target/MSP430/MCTargetDesc/CMakeLists.txt      |    8 +
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp     |   51 +-
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.h       |    4 +-
 lib/Target/MSP430/MSP430AsmPrinter.cpp             |   14 +-
 lib/Target/MSP430/MSP430ISelLowering.cpp           |    7 +-
 lib/Target/MSP430/MSP430ISelLowering.h             |    4 +-
 lib/Target/MSP430/MSP430InstrInfo.cpp              |    2 +-
 lib/Target/MSP430/MSP430RegisterInfo.cpp           |   16 +-
 lib/Target/MSP430/MSP430RegisterInfo.h             |    5 -
 lib/Target/MSP430/MSP430Subtarget.cpp              |    2 +-
 lib/Target/MSP430/MSP430TargetMachine.cpp          |   11 +-
 lib/Target/MSP430/MSP430TargetMachine.h            |    5 +-
 lib/Target/MSP430/TargetInfo/CMakeLists.txt        |    8 +-
 lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp  |    2 +-
 lib/Target/Mangler.cpp                             |    4 +-
 lib/Target/Mips/CMakeLists.txt                     |   28 +-
 lib/Target/Mips/InstPrinter/CMakeLists.txt         |    8 +-
 lib/Target/Mips/InstPrinter/Makefile               |    2 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp    |    6 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.h      |    4 +-
 lib/Target/Mips/MCTargetDesc/CMakeLists.txt        |   13 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp    |  117 +
 lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h        |  113 +
 lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h      |   90 +
 lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp     |    3 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp |   52 +
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp  |  122 +-
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h    |   21 +-
 lib/Target/Mips/Mips.h                             |    3 +
 lib/Target/Mips/Mips.td                            |   32 +-
 lib/Target/Mips/Mips64InstrInfo.td                 |  214 +
 lib/Target/Mips/MipsAsmPrinter.cpp                 |   42 +-
 lib/Target/Mips/MipsCallingConv.td                 |   55 +-
 lib/Target/Mips/MipsCodeEmitter.cpp                |  245 ++
 lib/Target/Mips/MipsDelaySlotFiller.cpp            |  204 +-
 lib/Target/Mips/MipsFrameLowering.cpp              |   19 +-
 lib/Target/Mips/MipsFrameLowering.h                |    5 +-
 lib/Target/Mips/MipsISelDAGToDAG.cpp               |  185 +-
 lib/Target/Mips/MipsISelLowering.cpp               |  848 ++--
 lib/Target/Mips/MipsISelLowering.h                 |   16 +-
 lib/Target/Mips/MipsInstrFPU.td                    |  236 +-
 lib/Target/Mips/MipsInstrFormats.td                |   44 +-
 lib/Target/Mips/MipsInstrInfo.cpp                  |  210 +-
 lib/Target/Mips/MipsInstrInfo.h                    |   37 +-
 lib/Target/Mips/MipsInstrInfo.td                   |  832 ++--
 lib/Target/Mips/MipsJITInfo.cpp                    |  230 +
 lib/Target/Mips/MipsJITInfo.h                      |   70 +
 lib/Target/Mips/MipsMCInstLower.cpp                |   64 +-
 lib/Target/Mips/MipsMCInstLower.h                  |    5 +-
 lib/Target/Mips/MipsMCSymbolRefExpr.cpp            |    9 +-
 lib/Target/Mips/MipsMCSymbolRefExpr.h              |    7 +-
 lib/Target/Mips/MipsMachineFunction.h              |    9 +-
 lib/Target/Mips/MipsRegisterInfo.cpp               |  205 +-
 lib/Target/Mips/MipsRegisterInfo.h                 |    4 -
 lib/Target/Mips/MipsRegisterInfo.td                |  119 +-
 lib/Target/Mips/MipsRelocations.h                  |   41 +
 lib/Target/Mips/MipsSubtarget.cpp                  |   38 +-
 lib/Target/Mips/MipsSubtarget.h                    |   15 +-
 lib/Target/Mips/MipsTargetMachine.cpp              |   73 +-
 lib/Target/Mips/MipsTargetMachine.h                |   48 +-
 lib/Target/Mips/MipsTargetObjectFile.cpp           |    2 +-
 lib/Target/Mips/TargetInfo/CMakeLists.txt          |    8 +-
 lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp      |   16 +-
 lib/Target/PTX/CMakeLists.txt                      |   32 +-
 lib/Target/PTX/InstPrinter/CMakeLists.txt          |   13 +
 lib/Target/PTX/InstPrinter/Makefile                |   16 +
 lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp      |  192 +
 lib/Target/PTX/InstPrinter/PTXInstPrinter.h        |   47 +
 lib/Target/PTX/MCTargetDesc/CMakeLists.txt         |    9 +
 lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h          |   63 +
 lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp    |   53 +-
 lib/Target/PTX/Makefile                            |    3 +-
 lib/Target/PTX/PTX.h                               |   28 +-
 lib/Target/PTX/PTX.td                              |   28 +-
 lib/Target/PTX/PTXAsmPrinter.cpp                   |  500 ++-
 lib/Target/PTX/PTXAsmPrinter.h                     |   57 +
 lib/Target/PTX/PTXCallingConv.td                   |   29 -
 lib/Target/PTX/PTXFPRoundingModePass.cpp           |  179 +
 lib/Target/PTX/PTXISelDAGToDAG.cpp                 |  182 +-
 lib/Target/PTX/PTXISelLowering.cpp                 |  273 +-
 lib/Target/PTX/PTXISelLowering.h                   |   19 +-
 lib/Target/PTX/PTXInstrFormats.td                  |   31 +-
 lib/Target/PTX/PTXInstrInfo.cpp                    |   82 +-
 lib/Target/PTX/PTXInstrInfo.td                     | 1241 +++---
 lib/Target/PTX/PTXInstrLoadStore.td                |  278 ++
 lib/Target/PTX/PTXIntrinsicInstrInfo.td            |   78 +-
 lib/Target/PTX/PTXMCAsmStreamer.cpp                |   15 +-
 lib/Target/PTX/PTXMCInstLower.cpp                  |   32 +
 lib/Target/PTX/PTXMFInfoExtract.cpp                |   36 +-
 lib/Target/PTX/PTXMachineFunctionInfo.h            |  163 +-
 lib/Target/PTX/PTXParamManager.cpp                 |   73 +
 lib/Target/PTX/PTXParamManager.h                   |   86 +
 lib/Target/PTX/PTXRegAlloc.cpp                     |   58 +
 lib/Target/PTX/PTXRegisterInfo.cpp                 |   29 +-
 lib/Target/PTX/PTXRegisterInfo.h                   |   18 +-
 lib/Target/PTX/PTXRegisterInfo.td                  |  540 +--
 lib/Target/PTX/PTXSelectionDAGInfo.cpp             |  149 +
 lib/Target/PTX/PTXSelectionDAGInfo.h               |   53 +
 lib/Target/PTX/PTXSubtarget.cpp                    |    2 +-
 lib/Target/PTX/PTXSubtarget.h                      |   11 +-
 lib/Target/PTX/PTXTargetMachine.cpp                |  318 +-
 lib/Target/PTX/PTXTargetMachine.h                  |   60 +-
 lib/Target/PTX/TargetInfo/CMakeLists.txt           |    8 +-
 lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp        |    2 +-
 lib/Target/PTX/generate-register-td.py             |  163 -
 lib/Target/PowerPC/CMakeLists.txt                  |   34 +-
 lib/Target/PowerPC/InstPrinter/CMakeLists.txt      |    8 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp  |   11 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h    |    2 +-
 lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt     |   12 +
 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp  |  191 +
 lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h      |   70 +
 lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h    |   45 +
 lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp   |    6 +-
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp      |  193 +
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp       |  115 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h  |   10 +
 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp  |   31 +
 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h    |   37 +
 lib/Target/PowerPC/PPC.h                           |   11 +-
 lib/Target/PowerPC/PPC.td                          |    4 +-
 lib/Target/PowerPC/PPCAsmBackend.cpp               |  123 -
 lib/Target/PowerPC/PPCAsmPrinter.cpp               |   14 +-
 lib/Target/PowerPC/PPCBranchSelector.cpp           |    2 +-
 lib/Target/PowerPC/PPCCodeEmitter.cpp              |    4 +-
 lib/Target/PowerPC/PPCFixupKinds.h                 |   45 -
 lib/Target/PowerPC/PPCFrameLowering.cpp            |   17 +-
 lib/Target/PowerPC/PPCFrameLowering.h              |    1 -
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp             |    2 +-
 lib/Target/PowerPC/PPCISelLowering.cpp             |   65 +-
 lib/Target/PowerPC/PPCISelLowering.h               |   17 +-
 lib/Target/PowerPC/PPCInstrInfo.cpp                |   36 +-
 lib/Target/PowerPC/PPCInstrInfo.td                 |    6 +
 lib/Target/PowerPC/PPCMCCodeEmitter.cpp            |  194 -
 lib/Target/PowerPC/PPCPredicates.cpp               |   31 -
 lib/Target/PowerPC/PPCPredicates.h                 |   39 -
 lib/Target/PowerPC/PPCRegisterInfo.cpp             |   79 +-
 lib/Target/PowerPC/PPCRegisterInfo.h               |    8 -
 lib/Target/PowerPC/PPCSubtarget.cpp                |    2 +-
 lib/Target/PowerPC/PPCTargetMachine.cpp            |   71 +-
 lib/Target/PowerPC/PPCTargetMachine.h              |   16 +-
 lib/Target/PowerPC/TargetInfo/CMakeLists.txt       |    8 +-
 .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp       |    2 +-
 lib/Target/README.txt                              |   10 +
 lib/Target/Sparc/CMakeLists.txt                    |   25 +-
 lib/Target/Sparc/MCTargetDesc/CMakeLists.txt       |    8 +
 .../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp       |   36 +-
 lib/Target/Sparc/SparcAsmPrinter.cpp               |    2 +-
 lib/Target/Sparc/SparcISelLowering.cpp             |    8 +-
 lib/Target/Sparc/SparcInstrInfo.cpp                |    2 +-
 lib/Target/Sparc/SparcRegisterInfo.cpp             |   15 +-
 lib/Target/Sparc/SparcRegisterInfo.h               |    4 -
 lib/Target/Sparc/SparcSubtarget.cpp                |    2 +-
 lib/Target/Sparc/SparcTargetMachine.cpp            |   27 +-
 lib/Target/Sparc/SparcTargetMachine.h              |   16 +-
 lib/Target/Sparc/TargetInfo/CMakeLists.txt         |    8 +-
 lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp    |    2 +-
 lib/Target/SystemZ/CMakeLists.txt                  |   25 +-
 lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt     |    7 +
 .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp   |   39 +-
 lib/Target/SystemZ/SystemZAsmPrinter.cpp           |    4 +-
 lib/Target/SystemZ/SystemZISelLowering.cpp         |    1 +
 lib/Target/SystemZ/SystemZInstrInfo.cpp            |    2 +-
 lib/Target/SystemZ/SystemZInstrInfo.td             |    2 +-
 lib/Target/SystemZ/SystemZRegisterInfo.cpp         |   17 +-
 lib/Target/SystemZ/SystemZRegisterInfo.h           |    4 -
 lib/Target/SystemZ/SystemZSubtarget.cpp            |    2 +-
 lib/Target/SystemZ/SystemZTargetMachine.cpp        |   13 +-
 lib/Target/SystemZ/SystemZTargetMachine.h          |    5 +-
 lib/Target/SystemZ/TargetInfo/CMakeLists.txt       |    8 +-
 .../SystemZ/TargetInfo/SystemZTargetInfo.cpp       |    2 +-
 lib/Target/Target.cpp                              |   10 +-
 lib/Target/TargetAsmInfo.cpp                       |   23 -
 lib/Target/TargetAsmLexer.cpp                      |   14 -
 lib/Target/TargetData.cpp                          |   53 +-
 lib/Target/TargetFrameLowering.cpp                 |    8 -
 lib/Target/TargetLoweringObjectFile.cpp            |   59 +-
 lib/Target/TargetMachine.cpp                       |   67 +-
 lib/Target/TargetRegisterInfo.cpp                  |   49 +-
 lib/Target/X86/AsmParser/CMakeLists.txt            |   11 +-
 lib/Target/X86/AsmParser/X86AsmLexer.cpp           |   20 +-
 lib/Target/X86/AsmParser/X86AsmParser.cpp          |   84 +-
 lib/Target/X86/CMakeLists.txt                      |   41 +-
 lib/Target/X86/Disassembler/CMakeLists.txt         |   10 +-
 lib/Target/X86/Disassembler/X86Disassembler.cpp    |   77 +-
 lib/Target/X86/Disassembler/X86Disassembler.h      |   26 +-
 .../X86/Disassembler/X86DisassemblerDecoder.c      |  165 +-
 .../Disassembler/X86DisassemblerDecoderCommon.h    |    8 +-
 lib/Target/X86/InstPrinter/CMakeLists.txt          |    9 +-
 lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp   |   10 +-
 lib/Target/X86/InstPrinter/X86ATTInstPrinter.h     |    2 +-
 lib/Target/X86/InstPrinter/X86InstComments.cpp     |   31 +-
 lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp |    7 +-
 lib/Target/X86/InstPrinter/X86IntelInstPrinter.h   |    2 +-
 lib/Target/X86/MCTargetDesc/CMakeLists.txt         |   13 +
 lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp      |  458 ++
 lib/Target/X86/MCTargetDesc/X86BaseInfo.h          |  548 +++
 lib/Target/X86/MCTargetDesc/X86FixupKinds.h        |   33 +
 lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp   | 1074 +++++
 lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp    |  338 +-
 lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h      |   45 +-
 .../X86/MCTargetDesc/X86MachObjectWriter.cpp       |  554 +++
 lib/Target/X86/README-SSE.txt                      |    2 +-
 lib/Target/X86/README.txt                          |   38 +-
 lib/Target/X86/SSEDomainFix.cpp                    |  506 ---
 lib/Target/X86/TargetInfo/CMakeLists.txt           |    8 +-
 lib/Target/X86/TargetInfo/X86TargetInfo.cpp        |    2 +-
 lib/Target/X86/Utils/CMakeLists.txt                |    8 +-
 lib/Target/X86/Utils/X86ShuffleDecode.cpp          |   75 +-
 lib/Target/X86/Utils/X86ShuffleDecode.h            |   20 +
 lib/Target/X86/X86.h                               |   30 +-
 lib/Target/X86/X86.td                              |   78 +-
 lib/Target/X86/X86AsmBackend.cpp                   |  453 --
 lib/Target/X86/X86AsmPrinter.cpp                   |   23 +-
 lib/Target/X86/X86CodeEmitter.cpp                  |   36 +-
 lib/Target/X86/X86ELFWriterInfo.cpp                |    2 +-
 lib/Target/X86/X86FastISel.cpp                     |   93 +-
 lib/Target/X86/X86FixupKinds.h                     |   33 -
 lib/Target/X86/X86FloatingPoint.cpp                |   34 +
 lib/Target/X86/X86FrameLowering.cpp                |  625 ++-
 lib/Target/X86/X86FrameLowering.h                  |    7 +-
 lib/Target/X86/X86ISelDAGToDAG.cpp                 |   16 +-
 lib/Target/X86/X86ISelLowering.cpp                 | 2940 ++++++++++---
 lib/Target/X86/X86ISelLowering.h                   |   62 +-
 lib/Target/X86/X86InstrArithmetic.td               |   96 +-
 lib/Target/X86/X86InstrCompiler.td                 |  102 +-
 lib/Target/X86/X86InstrExtension.td                |    4 +-
 lib/Target/X86/X86InstrFormats.td                  |    8 +-
 lib/Target/X86/X86InstrFragmentsSIMD.td            |   84 +-
 lib/Target/X86/X86InstrInfo.cpp                    | 1838 ++++----
 lib/Target/X86/X86InstrInfo.h                      |  553 +--
 lib/Target/X86/X86InstrInfo.td                     |  209 +-
 lib/Target/X86/X86InstrSSE.td                      | 4174 +++++++++++-------
 lib/Target/X86/X86InstrSystem.td                   |  114 +-
 lib/Target/X86/X86InstrVMX.td                      |   10 +-
 lib/Target/X86/X86MCCodeEmitter.cpp                | 1044 -----
 lib/Target/X86/X86MCInstLower.cpp                  |   23 +-
 lib/Target/X86/X86MachObjectWriter.cpp             |  554 ---
 lib/Target/X86/X86MachineFunctionInfo.h            |   20 +-
 lib/Target/X86/X86RegisterInfo.cpp                 |  165 +-
 lib/Target/X86/X86RegisterInfo.h                   |   25 +-
 lib/Target/X86/X86RegisterInfo.td                  |   11 +-
 lib/Target/X86/X86SelectionDAGInfo.cpp             |    2 +-
 lib/Target/X86/X86Subtarget.cpp                    |   78 +-
 lib/Target/X86/X86Subtarget.h                      |   36 +-
 lib/Target/X86/X86TargetMachine.cpp                |  173 +-
 lib/Target/X86/X86TargetMachine.h                  |   22 +-
 lib/Target/X86/X86TargetObjectFile.cpp             |   76 -
 lib/Target/X86/X86TargetObjectFile.h               |   22 -
 lib/Target/X86/X86VZeroUpper.cpp                   |  105 +
 lib/Target/XCore/CMakeLists.txt                    |   25 +-
 lib/Target/XCore/MCTargetDesc/CMakeLists.txt       |    7 +
 .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp       |   48 +-
 lib/Target/XCore/TargetInfo/CMakeLists.txt         |    8 +-
 lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp    |    2 +-
 lib/Target/XCore/XCoreAsmPrinter.cpp               |   60 +-
 lib/Target/XCore/XCoreFrameLowering.cpp            |   11 +-
 lib/Target/XCore/XCoreFrameLowering.h              |    2 -
 lib/Target/XCore/XCoreISelDAGToDAG.cpp             |   11 +-
 lib/Target/XCore/XCoreISelLowering.cpp             |   37 +-
 lib/Target/XCore/XCoreISelLowering.h               |    5 +-
 lib/Target/XCore/XCoreInstrInfo.cpp                |   12 +-
 lib/Target/XCore/XCoreInstrInfo.h                  |    5 +
 lib/Target/XCore/XCoreInstrInfo.td                 |   81 +-
 lib/Target/XCore/XCoreRegisterInfo.cpp             |   15 +-
 lib/Target/XCore/XCoreRegisterInfo.h               |    5 -
 lib/Target/XCore/XCoreSubtarget.cpp                |    2 +-
 lib/Target/XCore/XCoreTargetMachine.cpp            |   10 +-
 lib/Target/XCore/XCoreTargetMachine.h              |    5 +-
 448 files changed, 38648 insertions(+), 28015 deletions(-)
 delete mode 100644 lib/Target/ARM/ARMAddressingModes.h
 delete mode 100644 lib/Target/ARM/ARMAsmBackend.cpp
 delete mode 100644 lib/Target/ARM/ARMBaseInfo.h
 delete mode 100644 lib/Target/ARM/ARMFixupKinds.h
 delete mode 100644 lib/Target/ARM/ARMMCCodeEmitter.cpp
 delete mode 100644 lib/Target/ARM/ARMMCExpr.cpp
 delete mode 100644 lib/Target/ARM/ARMMCExpr.h
 delete mode 100644 lib/Target/ARM/ARMMachObjectWriter.cpp
 delete mode 100644 lib/Target/ARM/Disassembler/ARMDisassembler.h
 delete mode 100644 lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
 delete mode 100644 lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
 delete mode 100644 lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
 create mode 100644 lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
 delete mode 100644 lib/Target/ARM/NEONMoveFix.cpp
 delete mode 100644 lib/Target/MBlaze/MBlazeAsmBackend.cpp
 delete mode 100644 lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp
 create mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
 create mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
 create mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
 create mode 100644 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
 create mode 100644 lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
 create mode 100644 lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
 create mode 100644 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
 create mode 100644 lib/Target/Mips/Mips64InstrInfo.td
 create mode 100644 lib/Target/Mips/MipsCodeEmitter.cpp
 create mode 100644 lib/Target/Mips/MipsJITInfo.cpp
 create mode 100644 lib/Target/Mips/MipsJITInfo.h
 create mode 100644 lib/Target/Mips/MipsRelocations.h
 create mode 100644 lib/Target/PTX/InstPrinter/CMakeLists.txt
 create mode 100644 lib/Target/PTX/InstPrinter/Makefile
 create mode 100644 lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
 create mode 100644 lib/Target/PTX/InstPrinter/PTXInstPrinter.h
 create mode 100644 lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
 create mode 100644 lib/Target/PTX/PTXAsmPrinter.h
 delete mode 100644 lib/Target/PTX/PTXCallingConv.td
 create mode 100644 lib/Target/PTX/PTXFPRoundingModePass.cpp
 create mode 100644 lib/Target/PTX/PTXInstrLoadStore.td
 create mode 100644 lib/Target/PTX/PTXMCInstLower.cpp
 create mode 100644 lib/Target/PTX/PTXParamManager.cpp
 create mode 100644 lib/Target/PTX/PTXParamManager.h
 create mode 100644 lib/Target/PTX/PTXRegAlloc.cpp
 create mode 100644 lib/Target/PTX/PTXSelectionDAGInfo.cpp
 create mode 100644 lib/Target/PTX/PTXSelectionDAGInfo.h
 delete mode 100755 lib/Target/PTX/generate-register-td.py
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
 create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
 delete mode 100644 lib/Target/PowerPC/PPCAsmBackend.cpp
 delete mode 100644 lib/Target/PowerPC/PPCFixupKinds.h
 delete mode 100644 lib/Target/PowerPC/PPCMCCodeEmitter.cpp
 delete mode 100644 lib/Target/PowerPC/PPCPredicates.cpp
 delete mode 100644 lib/Target/PowerPC/PPCPredicates.h
 delete mode 100644 lib/Target/TargetAsmInfo.cpp
 delete mode 100644 lib/Target/TargetAsmLexer.cpp
 create mode 100644 lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
 create mode 100644 lib/Target/X86/MCTargetDesc/X86BaseInfo.h
 create mode 100644 lib/Target/X86/MCTargetDesc/X86FixupKinds.h
 create mode 100644 lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
 create mode 100644 lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
 delete mode 100644 lib/Target/X86/SSEDomainFix.cpp
 delete mode 100644 lib/Target/X86/X86AsmBackend.cpp
 delete mode 100644 lib/Target/X86/X86FixupKinds.h
 delete mode 100644 lib/Target/X86/X86MCCodeEmitter.cpp
 delete mode 100644 lib/Target/X86/X86MachObjectWriter.cpp
 create mode 100644 lib/Target/X86/X86VZeroUpper.cpp

(limited to 'lib/Target')

diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 08dc340f8541d..16d0da3b8ac72 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,7 +15,7 @@
 #ifndef TARGET_ARM_H
 #define TARGET_ARM_H
 
-#include "ARMBaseInfo.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -29,19 +29,7 @@ class ARMBaseTargetMachine;
 class FunctionPass;
 class JITCodeEmitter;
 class MachineInstr;
-class MCCodeEmitter;
 class MCInst;
-class MCInstrInfo;
-class MCObjectWriter;
-class MCSubtargetInfo;
-class TargetAsmBackend;
-class formatted_raw_ostream;
-
-MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
-                                      const MCSubtargetInfo &STI,
-                                      MCContext &Ctx);
-
-TargetAsmBackend *createARMAsmBackend(const Target &, const std::string &);
 
 FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
@@ -53,7 +41,6 @@ FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
 FunctionPass *createARMConstantIslandPass();
-FunctionPass *createNEONMoveFixPass();
 FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createThumb2SizeReductionPass();
@@ -61,12 +48,6 @@ FunctionPass *createThumb2SizeReductionPass();
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
 
-/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
-MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
-                                          bool Is64Bit,
-                                          uint32_t CPUType,
-                                          uint32_t CPUSubtype);
-
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index cf333ccd49ba6..5c727ad6e3439 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -23,6 +23,9 @@ include "llvm/Target/Target.td"
 def ModeThumb  : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
                                   "Thumb mode">;
 
+def ModeNaCl   : SubtargetFeature<"nacl-mode", "InNaClMode", "true",
+                                  "Native client mode">;
+
 //===----------------------------------------------------------------------===//
 // ARM Subtarget features.
 //
@@ -85,12 +88,16 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
 
 /// Some M architectures don't have the DSP extension (v7E-M vs. v7M)
 def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
-                                 "Supports v7 DSP instructions in Thumb2.">;
+                                 "Supports v7 DSP instructions in Thumb2">;
 
 // Multiprocessing extension.
 def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
                                  "Supports Multiprocessing extension">;
 
+// M-series ISA?
+def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+                                     "Is microcontroller profile ('M' series)">;
+
 // ARM ISAs.
 def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
                                    "Support ARM v4T instructions">;
@@ -105,7 +112,7 @@ def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
                                    [HasV5TEOps]>;
 def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
                                    "Support ARM v6t2 instructions",
-                                   [HasV6Ops, FeatureThumb2, FeatureDSPThumb2]>;
+                                   [HasV6Ops, FeatureThumb2]>;
 def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
                                    "Support ARM v7 instructions",
                                    [HasV6T2Ops]>;
@@ -182,12 +189,14 @@ def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
 
 // V6M Processors.
 def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
-                                                       FeatureDB]>;
+                                                       FeatureDB, FeatureMClass]>;
 
 // V6T2 Processors.
-def : Processor<"arm1156t2-s",      ARMV6Itineraries, [HasV6T2Ops]>;
+def : Processor<"arm1156t2-s",      ARMV6Itineraries, [HasV6T2Ops,
+                                                       FeatureDSPThumb2]>;
 def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
-                                                       FeatureHasSlowFPVMLx]>;
+                                                       FeatureHasSlowFPVMLx,
+                                                       FeatureDSPThumb2]>;
 
 // V7a Processors.
 def : Processor<"cortex-a8",        CortexA8Itineraries,
@@ -203,14 +212,14 @@ def : Processor<"cortex-a9-mp",     CortexA9Itineraries,
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [HasV7Ops,
                                      FeatureThumb2, FeatureNoARM, FeatureDB,
-                                     FeatureHWDiv]>;
+                                     FeatureHWDiv, FeatureMClass]>;
 
 // V7EM Processors.
 def : ProcNoItin<"cortex-m4",       [HasV7Ops,
                                      FeatureThumb2, FeatureNoARM, FeatureDB,
                                      FeatureHWDiv, FeatureDSPThumb2,
                                      FeatureT2XtPk, FeatureVFP2,
-                                     FeatureVFPOnlySP]>;
+                                     FeatureVFPOnlySP, FeatureMClass]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
deleted file mode 100644
index 595708fa78811..0000000000000
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ /dev/null
@@ -1,595 +0,0 @@
-//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM addressing mode implementation stuff.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
-#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
-
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-
-namespace llvm {
-
-/// ARM_AM - ARM Addressing Mode Stuff
-namespace ARM_AM {
-  enum ShiftOpc {
-    no_shift = 0,
-    asr,
-    lsl,
-    lsr,
-    ror,
-    rrx
-  };
-
-  enum AddrOpc {
-    add = '+', sub = '-'
-  };
-
-  static inline const char *getAddrOpcStr(AddrOpc Op) {
-    return Op == sub ? "-" : "";
-  }
-
-  static inline const char *getShiftOpcStr(ShiftOpc Op) {
-    switch (Op) {
-    default: assert(0 && "Unknown shift opc!");
-    case ARM_AM::asr: return "asr";
-    case ARM_AM::lsl: return "lsl";
-    case ARM_AM::lsr: return "lsr";
-    case ARM_AM::ror: return "ror";
-    case ARM_AM::rrx: return "rrx";
-    }
-  }
-
-  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
-    switch (Op) {
-    default: assert(0 && "Unknown shift opc!");
-    case ARM_AM::asr: return 2;
-    case ARM_AM::lsl: return 0;
-    case ARM_AM::lsr: return 1;
-    case ARM_AM::ror: return 3;
-    }
-  }
-
-  static inline ShiftOpc getShiftOpcForNode(SDValue N) {
-    switch (N.getOpcode()) {
-    default:          return ARM_AM::no_shift;
-    case ISD::SHL:    return ARM_AM::lsl;
-    case ISD::SRL:    return ARM_AM::lsr;
-    case ISD::SRA:    return ARM_AM::asr;
-    case ISD::ROTR:   return ARM_AM::ror;
-    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
-    // Can't handle RRX here, because it would require folding a flag into
-    // the addressing mode.  :(  This causes us to miss certain things.
-    //case ARMISD::RRX: return ARM_AM::rrx;
-    }
-  }
-
-  enum AMSubMode {
-    bad_am_submode = 0,
-    ia,
-    ib,
-    da,
-    db
-  };
-
-  static inline const char *getAMSubModeStr(AMSubMode Mode) {
-    switch (Mode) {
-    default: assert(0 && "Unknown addressing sub-mode!");
-    case ARM_AM::ia: return "ia";
-    case ARM_AM::ib: return "ib";
-    case ARM_AM::da: return "da";
-    case ARM_AM::db: return "db";
-    }
-  }
-
-  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
-  ///
-  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
-    assert(Amt < 32 && "Invalid rotate amount");
-    return (Val >> Amt) | (Val << ((32-Amt)&31));
-  }
-
-  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
-  ///
-  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
-    assert(Amt < 32 && "Invalid rotate amount");
-    return (Val << Amt) | (Val >> ((32-Amt)&31));
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #1: shift_operand with registers
-  //===--------------------------------------------------------------------===//
-  //
-  // This 'addressing mode' is used for arithmetic instructions.  It can
-  // represent things like:
-  //   reg
-  //   reg [asr|lsl|lsr|ror|rrx] reg
-  //   reg [asr|lsl|lsr|ror|rrx] imm
-  //
-  // This is stored three operands [rega, regb, opc].  The first is the base
-  // reg, the second is the shift amount (or reg0 if not present or imm).  The
-  // third operand encodes the shift opcode and the imm if a reg isn't present.
-  //
-  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
-    return ShOp | (Imm << 3);
-  }
-  static inline unsigned getSORegOffset(unsigned Op) {
-    return Op >> 3;
-  }
-  static inline ShiftOpc getSORegShOp(unsigned Op) {
-    return (ShiftOpc)(Op & 7);
-  }
-
-  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
-  /// the 8-bit imm value.
-  static inline unsigned getSOImmValImm(unsigned Imm) {
-    return Imm & 0xFF;
-  }
-  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
-  /// the rotate amount.
-  static inline unsigned getSOImmValRot(unsigned Imm) {
-    return (Imm >> 8) * 2;
-  }
-
-  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
-  /// computing the rotate amount to use.  If this immediate value cannot be
-  /// handled with a single shifter-op, determine a good rotate amount that will
-  /// take a maximal chunk of bits out of the immediate.
-  static inline unsigned getSOImmValRotate(unsigned Imm) {
-    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
-    // of zero.
-    if ((Imm & ~255U) == 0) return 0;
-
-    // Use CTZ to compute the rotate amount.
-    unsigned TZ = CountTrailingZeros_32(Imm);
-
-    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
-    // not 9.
-    unsigned RotAmt = TZ & ~1;
-
-    // If we can handle this spread, return it.
-    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
-      return (32-RotAmt)&31;  // HW rotates right, not left.
-
-    // For values like 0xF000000F, we should ignore the low 6 bits, then
-    // retry the hunt.
-    if (Imm & 63U) {
-      unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U);
-      unsigned RotAmt2 = TZ2 & ~1;
-      if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
-        return (32-RotAmt2)&31;  // HW rotates right, not left.
-    }
-
-    // Otherwise, we have no way to cover this span of bits with a single
-    // shifter_op immediate.  Return a chunk of bits that will be useful to
-    // handle.
-    return (32-RotAmt)&31;  // HW rotates right, not left.
-  }
-
-  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
-  /// into an shifter_operand immediate operand, return the 12-bit encoding for
-  /// it.  If not, return -1.
-  static inline int getSOImmVal(unsigned Arg) {
-    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
-    // of zero.
-    if ((Arg & ~255U) == 0) return Arg;
-
-    unsigned RotAmt = getSOImmValRotate(Arg);
-
-    // If this cannot be handled with a single shifter_op, bail out.
-    if (rotr32(~255U, RotAmt) & Arg)
-      return -1;
-
-    // Encode this correctly.
-    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
-  }
-
-  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
-  /// or'ing together two SOImmVal's.
-  static inline bool isSOImmTwoPartVal(unsigned V) {
-    // If this can be handled with a single shifter_op, bail out.
-    V = rotr32(~255U, getSOImmValRotate(V)) & V;
-    if (V == 0)
-      return false;
-
-    // If this can be handled with two shifter_op's, accept.
-    V = rotr32(~255U, getSOImmValRotate(V)) & V;
-    return V == 0;
-  }
-
-  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
-  /// return the first chunk of it.
-  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
-    return rotr32(255U, getSOImmValRotate(V)) & V;
-  }
-
-  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
-  /// return the second chunk of it.
-  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
-    // Mask out the first hunk.
-    V = rotr32(~255U, getSOImmValRotate(V)) & V;
-
-    // Take what's left.
-    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
-    return V;
-  }
-
-  /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
-  /// by a left shift. Returns the shift amount to use.
-  static inline unsigned getThumbImmValShift(unsigned Imm) {
-    // 8-bit (or less) immediates are trivially immediate operand with a shift
-    // of zero.
-    if ((Imm & ~255U) == 0) return 0;
-
-    // Use CTZ to compute the shift amount.
-    return CountTrailingZeros_32(Imm);
-  }
-
-  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
-  /// by left shifting a 8-bit immediate.
-  static inline bool isThumbImmShiftedVal(unsigned V) {
-    // If this can be handled with
-    V = (~255U << getThumbImmValShift(V)) & V;
-    return V == 0;
-  }
-
-  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
-  /// by a left shift. Returns the shift amount to use.
-  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
-    // 16-bit (or less) immediates are trivially immediate operand with a shift
-    // of zero.
-    if ((Imm & ~65535U) == 0) return 0;
-
-    // Use CTZ to compute the shift amount.
-    return CountTrailingZeros_32(Imm);
-  }
-
-  /// isThumbImm16ShiftedVal - Return true if the specified value can be
-  /// obtained by left shifting a 16-bit immediate.
-  static inline bool isThumbImm16ShiftedVal(unsigned V) {
-    // If this can be handled with
-    V = (~65535U << getThumbImm16ValShift(V)) & V;
-    return V == 0;
-  }
-
-  /// getThumbImmNonShiftedVal - If V is a value that satisfies
-  /// isThumbImmShiftedVal, return the non-shiftd value.
-  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
-    return V >> getThumbImmValShift(V);
-  }
-
-
-  /// getT2SOImmValSplat - Return the 12-bit encoded representation
-  /// if the specified value can be obtained by splatting the low 8 bits
-  /// into every other byte or every byte of a 32-bit value. i.e.,
-  ///     00000000 00000000 00000000 abcdefgh    control = 0
-  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
-  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
-  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
-  /// Return -1 if none of the above apply.
-  /// See ARM Reference Manual A6.3.2.
-  static inline int getT2SOImmValSplatVal(unsigned V) {
-    unsigned u, Vs, Imm;
-    // control = 0
-    if ((V & 0xffffff00) == 0)
-      return V;
-
-    // If the value is zeroes in the first byte, just shift those off
-    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
-    // Any passing value only has 8 bits of payload, splatted across the word
-    Imm = Vs & 0xff;
-    // Likewise, any passing values have the payload splatted into the 3rd byte
-    u = Imm | (Imm << 16);
-
-    // control = 1 or 2
-    if (Vs == u)
-      return (((Vs == V) ? 1 : 2) << 8) | Imm;
-
-    // control = 3
-    if (Vs == (u | (u << 8)))
-      return (3 << 8) | Imm;
-
-    return -1;
-  }
-
-  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
-  /// specified value is a rotated 8-bit value. Return -1 if no rotation
-  /// encoding is possible.
-  /// See ARM Reference Manual A6.3.2.
-  static inline int getT2SOImmValRotateVal(unsigned V) {
-    unsigned RotAmt = CountLeadingZeros_32(V);
-    if (RotAmt >= 24)
-      return -1;
-
-    // If 'Arg' can be handled with a single shifter_op return the value.
-    if ((rotr32(0xff000000U, RotAmt) & V) == V)
-      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
-
-    return -1;
-  }
-
-  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
-  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
-  /// encoding for it.  If not, return -1.
-  /// See ARM Reference Manual A6.3.2.
-  static inline int getT2SOImmVal(unsigned Arg) {
-    // If 'Arg' is an 8-bit splat, then get the encoded value.
-    int Splat = getT2SOImmValSplatVal(Arg);
-    if (Splat != -1)
-      return Splat;
-
-    // If 'Arg' can be handled with a single shifter_op return the value.
-    int Rot = getT2SOImmValRotateVal(Arg);
-    if (Rot != -1)
-      return Rot;
-
-    return -1;
-  }
-
-  static inline unsigned getT2SOImmValRotate(unsigned V) {
-    if ((V & ~255U) == 0) return 0;
-    // Use CTZ to compute the rotate amount.
-    unsigned RotAmt = CountTrailingZeros_32(V);
-    return (32 - RotAmt) & 31;
-  }
-
-  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
-    unsigned V = Imm;
-    // Passing values can be any combination of splat values and shifter
-    // values. If this can be handled with a single shifter or splat, bail
-    // out. Those should be handled directly, not with a two-part val.
-    if (getT2SOImmValSplatVal(V) != -1)
-      return false;
-    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
-    if (V == 0)
-      return false;
-
-    // If this can be handled as an immediate, accept.
-    if (getT2SOImmVal(V) != -1) return true;
-
-    // Likewise, try masking out a splat value first.
-    V = Imm;
-    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
-      V &= ~0xff00ff00U;
-    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
-      V &= ~0x00ff00ffU;
-    // If what's left can be handled as an immediate, accept.
-    if (getT2SOImmVal(V) != -1) return true;
-
-    // Otherwise, do not accept.
-    return false;
-  }
-
-  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
-    assert (isT2SOImmTwoPartVal(Imm) &&
-            "Immedate cannot be encoded as two part immediate!");
-    // Try a shifter operand as one part
-    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
-    // If the rest is encodable as an immediate, then return it.
-    if (getT2SOImmVal(V) != -1) return V;
-
-    // Try masking out a splat value first.
-    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
-      return Imm & 0xff00ff00U;
-
-    // The other splat is all that's left as an option.
-    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
-    return Imm & 0x00ff00ffU;
-  }
-
-  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
-    // Mask out the first hunk
-    Imm ^= getT2SOImmTwoPartFirst(Imm);
-    // Return what's left
-    assert (getT2SOImmVal(Imm) != -1 &&
-            "Unable to encode second part of T2 two part SO immediate");
-    return Imm;
-  }
-
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #2
-  //===--------------------------------------------------------------------===//
-  //
-  // This is used for most simple load/store instructions.
-  //
-  // addrmode2 := reg +/- reg shop imm
-  // addrmode2 := reg +/- imm12
-  //
-  // The first operand is always a Reg.  The second operand is a reg if in
-  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
-  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
-  // fourth operand 16-17 encodes the index mode.
-  //
-  // If this addressing mode is a frame index (before prolog/epilog insertion
-  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
-  // with no shift amount for the frame offset.
-  //
-  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
-                                   unsigned IdxMode = 0) {
-    assert(Imm12 < (1 << 12) && "Imm too large!");
-    bool isSub = Opc == sub;
-    return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
-  }
-  static inline unsigned getAM2Offset(unsigned AM2Opc) {
-    return AM2Opc & ((1 << 12)-1);
-  }
-  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
-    return ((AM2Opc >> 12) & 1) ? sub : add;
-  }
-  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
-    return (ShiftOpc)((AM2Opc >> 13) & 7);
-  }
-  static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
-    return (AM2Opc >> 16);
-  }
-
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #3
-  //===--------------------------------------------------------------------===//
-  //
-  // This is used for sign-extending loads, and load/store-pair instructions.
-  //
-  // addrmode3 := reg +/- reg
-  // addrmode3 := reg +/- imm8
-  //
-  // The first operand is always a Reg.  The second operand is a reg if in
-  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
-  // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
-  // index mode.
-
-  /// getAM3Opc - This function encodes the addrmode3 opc field.
-  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
-                                   unsigned IdxMode = 0) {
-    bool isSub = Opc == sub;
-    return ((int)isSub << 8) | Offset | (IdxMode << 9);
-  }
-  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
-    return AM3Opc & 0xFF;
-  }
-  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
-    return ((AM3Opc >> 8) & 1) ? sub : add;
-  }
-  static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
-    return (AM3Opc >> 9);
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #4
-  //===--------------------------------------------------------------------===//
-  //
-  // This is used for load / store multiple instructions.
-  //
-  // addrmode4 := reg, <mode>
-  //
-  // The four modes are:
-  //    IA - Increment after
-  //    IB - Increment before
-  //    DA - Decrement after
-  //    DB - Decrement before
-  // For VFP instructions, only the IA and DB modes are valid.
-
-  static inline AMSubMode getAM4SubMode(unsigned Mode) {
-    return (AMSubMode)(Mode & 0x7);
-  }
-
-  static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
-    return (int)SubMode;
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #5
-  //===--------------------------------------------------------------------===//
-  //
-  // This is used for coprocessor instructions, such as FP load/stores.
-  //
-  // addrmode5 := reg +/- imm8*4
-  //
-  // The first operand is always a Reg.  The second operand encodes the
-  // operation in bit 8 and the immediate in bits 0-7.
-
-  /// getAM5Opc - This function encodes the addrmode5 opc field.
-  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
-    bool isSub = Opc == sub;
-    return ((int)isSub << 8) | Offset;
-  }
-  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
-    return AM5Opc & 0xFF;
-  }
-  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
-    return ((AM5Opc >> 8) & 1) ? sub : add;
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Addressing Mode #6
-  //===--------------------------------------------------------------------===//
-  //
-  // This is used for NEON load / store instructions.
-  //
-  // addrmode6 := reg with optional alignment
-  //
-  // This is stored in two operands [regaddr, align].  The first is the
-  // address register.  The second operand is the value of the alignment
-  // specifier in bytes or zero if no explicit alignment.
-  // Valid alignments depend on the specific instruction.
-
-  //===--------------------------------------------------------------------===//
-  // NEON Modified Immediates
-  //===--------------------------------------------------------------------===//
-  //
-  // Several NEON instructions (e.g., VMOV) take a "modified immediate"
-  // vector operand, where a small immediate encoded in the instruction
-  // specifies a full NEON vector value.  These modified immediates are
-  // represented here as encoded integers.  The low 8 bits hold the immediate
-  // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
-  // the "Cmode" field of the instruction.  The interfaces below treat the
-  // Op and Cmode values as a single 5-bit value.
-
-  static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
-    return (OpCmode << 8) | Val;
-  }
-  static inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
-    return (ModImm >> 8) & 0x1f;
-  }
-  static inline unsigned getNEONModImmVal(unsigned ModImm) {
-    return ModImm & 0xff;
-  }
-
-  /// decodeNEONModImm - Decode a NEON modified immediate value into the
-  /// element value and the element size in bits.  (If the element size is
-  /// smaller than the vector, it is splatted into all the elements.)
-  static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
-    unsigned OpCmode = getNEONModImmOpCmode(ModImm);
-    unsigned Imm8 = getNEONModImmVal(ModImm);
-    uint64_t Val = 0;
-
-    if (OpCmode == 0xe) {
-      // 8-bit vector elements
-      Val = Imm8;
-      EltBits = 8;
-    } else if ((OpCmode & 0xc) == 0x8) {
-      // 16-bit vector elements
-      unsigned ByteNum = (OpCmode & 0x6) >> 1;
-      Val = Imm8 << (8 * ByteNum);
-      EltBits = 16;
-    } else if ((OpCmode & 0x8) == 0) {
-      // 32-bit vector elements, zero with one byte set
-      unsigned ByteNum = (OpCmode & 0x6) >> 1;
-      Val = Imm8 << (8 * ByteNum);
-      EltBits = 32;
-    } else if ((OpCmode & 0xe) == 0xc) {
-      // 32-bit vector elements, one byte with low bits set
-      unsigned ByteNum = 1 + (OpCmode & 0x1);
-      Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
-      EltBits = 32;
-    } else if (OpCmode == 0x1e) {
-      // 64-bit vector elements
-      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
-        if ((ModImm >> ByteNum) & 1)
-          Val |= (uint64_t)0xff << (8 * ByteNum);
-      }
-      EltBits = 64;
-    } else {
-      assert(false && "Unsupported NEON immediate");
-    }
-    return Val;
-  }
-
-  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
-
-} // end namespace ARM_AM
-} // end namespace llvm
-
-#endif
-
diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp
deleted file mode 100644
index 5e438a9767324..0000000000000
--- a/lib/Target/ARM/ARMAsmBackend.cpp
+++ /dev/null
@@ -1,516 +0,0 @@
-//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMFixupKinds.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Object/MachOFormat.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmBackend.h"
-#include "llvm/Target/TargetRegistry.h"
-using namespace llvm;
-
-namespace {
-class ARMELFObjectWriter : public MCELFObjectTargetWriter {
-public:
-  ARMELFObjectWriter(Triple::OSType OSType)
-    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSType, ELF::EM_ARM,
-                              /*HasRelocationAddend*/ false) {}
-};
-
-class ARMAsmBackend : public TargetAsmBackend {
-  bool isThumbMode;  // Currently emitting Thumb code.
-public:
-  ARMAsmBackend(const Target &T) : TargetAsmBackend(), isThumbMode(false) {}
-
-  unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
-    const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
-// This table *must* be in the order that the fixup_* kinds are defined in
-// ARMFixupKinds.h.
-//
-// Name                      Offset (bits) Size (bits)     Flags
-{ "fixup_arm_ldst_pcrel_12", 1,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_pcrel_10",      1,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_adr_pcrel_12",  1,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_blx",     7,            21,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cp",      1,             8,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
-// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
-{ "fixup_arm_movt_hi16",     0,            20,  0 },
-{ "fixup_arm_movw_lo16",     0,            20,  0 },
-{ "fixup_t2_movt_hi16",      0,            20,  0 },
-{ "fixup_t2_movw_lo16",      0,            20,  0 },
-{ "fixup_arm_movt_hi16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_movw_lo16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_movt_hi16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_movw_lo16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
-    };
-
-    if (Kind < FirstTargetFixupKind)
-      return TargetAsmBackend::getFixupKindInfo(Kind);
-
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return Infos[Kind - FirstTargetFixupKind];
-  }
-
-  bool MayNeedRelaxation(const MCInst &Inst) const;
-
-  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
-
-  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
-
-  void HandleAssemblerFlag(MCAssemblerFlag Flag) {
-    switch (Flag) {
-    default: break;
-    case MCAF_Code16:
-      setIsThumb(true);
-      break;
-    case MCAF_Code32:
-      setIsThumb(false);
-      break;
-    }
-  }
-
-  unsigned getPointerSize() const { return 4; }
-  bool isThumb() const { return isThumbMode; }
-  void setIsThumb(bool it) { isThumbMode = it; }
-};
-} // end anonymous namespace
-
-bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
-  // FIXME: Thumb targets, different move constant targets..
-  return false;
-}
-
-void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
-  assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented");
-  return;
-}
-
-bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
-  if (isThumb()) {
-    // FIXME: 0xbf00 is the ARMv7 value. For v6 and before, we'll need to
-    // use 0x46c0 (which is a 'mov r8, r8' insn).
-    uint64_t NumNops = Count / 2;
-    for (uint64_t i = 0; i != NumNops; ++i)
-      OW->Write16(0xbf00);
-    if (Count & 1)
-      OW->Write8(0);
-    return true;
-  }
-  // ARM mode
-  uint64_t NumNops = Count / 4;
-  for (uint64_t i = 0; i != NumNops; ++i)
-    OW->Write32(0xe1a00000);
-  switch (Count % 4) {
-  default: break; // No leftover bytes to write
-  case 1: OW->Write8(0); break;
-  case 2: OW->Write16(0); break;
-  case 3: OW->Write16(0); OW->Write8(0xa0); break;
-  }
-
-  return true;
-}
-
-static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
-  switch (Kind) {
-  default:
-    llvm_unreachable("Unknown fixup kind!");
-  case FK_Data_1:
-  case FK_Data_2:
-  case FK_Data_4:
-    return Value;
-  case ARM::fixup_arm_movt_hi16:
-    Value >>= 16;
-    // Fallthrough
-  case ARM::fixup_arm_movw_lo16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
-  case ARM::fixup_arm_movw_lo16_pcrel: {
-    unsigned Hi4 = (Value & 0xF000) >> 12;
-    unsigned Lo12 = Value & 0x0FFF;
-    assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
-            "Out of range pc-relative fixup value!");
-    // inst{19-16} = Hi4;
-    // inst{11-0} = Lo12;
-    Value = (Hi4 << 16) | (Lo12);
-    return Value;
-  }
-  case ARM::fixup_t2_movt_hi16:
-    Value >>= 16;
-    // Fallthrough
-  case ARM::fixup_t2_movw_lo16:
-  case ARM::fixup_t2_movt_hi16_pcrel:  //FIXME: Shouldn't this be shifted like
-                                       // the other hi16 fixup?
-  case ARM::fixup_t2_movw_lo16_pcrel: {
-    unsigned Hi4 = (Value & 0xF000) >> 12;
-    unsigned i = (Value & 0x800) >> 11;
-    unsigned Mid3 = (Value & 0x700) >> 8;
-    unsigned Lo8 = Value & 0x0FF;
-    // inst{19-16} = Hi4;
-    // inst{26} = i;
-    // inst{14-12} = Mid3;
-    // inst{7-0} = Lo8;
-    // The value comes in as the whole thing, not just the portion required
-    // for this fixup, so we need to mask off the bits not handled by this
-    // portion (lo vs. hi).
-    Value &= 0xffff;
-    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
-    uint64_t swapped = (Value & 0xFFFF0000) >> 16;
-    swapped |= (Value & 0x0000FFFF) << 16;
-    return swapped;
-  }
-  case ARM::fixup_arm_ldst_pcrel_12:
-    // ARM PC-relative values are offset by 8.
-    Value -= 4;
-    // FALLTHROUGH
-  case ARM::fixup_t2_ldst_pcrel_12: {
-    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
-    Value -= 4;
-    bool isAdd = true;
-    if ((int64_t)Value < 0) {
-      Value = -Value;
-      isAdd = false;
-    }
-    assert ((Value < 4096) && "Out of range pc-relative fixup value!");
-    Value |= isAdd << 23;
-
-    // Same addressing mode as fixup_arm_pcrel_10,
-    // but with 16-bit halfwords swapped.
-    if (Kind == ARM::fixup_t2_ldst_pcrel_12) {
-      uint64_t swapped = (Value & 0xFFFF0000) >> 16;
-      swapped |= (Value & 0x0000FFFF) << 16;
-      return swapped;
-    }
-
-    return Value;
-  }
-  case ARM::fixup_thumb_adr_pcrel_10:
-    return ((Value - 4) >> 2) & 0xff;
-  case ARM::fixup_arm_adr_pcrel_12: {
-    // ARM PC-relative values are offset by 8.
-    Value -= 8;
-    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
-    if ((int64_t)Value < 0) {
-      Value = -Value;
-      opc = 2; // 0b0010
-    }
-    assert(ARM_AM::getSOImmVal(Value) != -1 &&
-           "Out of range pc-relative fixup value!");
-    // Encode the immediate and shift the opcode into place.
-    return ARM_AM::getSOImmVal(Value) | (opc << 21);
-  }
-
-  case ARM::fixup_t2_adr_pcrel_12: {
-    Value -= 4;
-    unsigned opc = 0;
-    if ((int64_t)Value < 0) {
-      Value = -Value;
-      opc = 5;
-    }
-
-    uint32_t out = (opc << 21);
-    out |= (Value & 0x800) << 15;
-    out |= (Value & 0x700) << 4;
-    out |= (Value & 0x0FF);
-
-    uint64_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
-  }
-
-  case ARM::fixup_arm_condbranch:
-  case ARM::fixup_arm_uncondbranch:
-    // These values don't encode the low two bits since they're always zero.
-    // Offset by 8 just as above.
-    return 0xffffff & ((Value - 8) >> 2);
-  case ARM::fixup_t2_uncondbranch: {
-    Value = Value - 4;
-    Value >>= 1; // Low bit is not encoded.
-
-    uint32_t out = 0;
-    bool I =  Value & 0x800000;
-    bool J1 = Value & 0x400000;
-    bool J2 = Value & 0x200000;
-    J1 ^= I;
-    J2 ^= I;
-
-    out |= I  << 26; // S bit
-    out |= !J1 << 13; // J1 bit
-    out |= !J2 << 11; // J2 bit
-    out |= (Value & 0x1FF800)  << 5; // imm6 field
-    out |= (Value & 0x0007FF);        // imm11 field
-
-    uint64_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
-  }
-  case ARM::fixup_t2_condbranch: {
-    Value = Value - 4;
-    Value >>= 1; // Low bit is not encoded.
-
-    uint64_t out = 0;
-    out |= (Value & 0x80000) << 7; // S bit
-    out |= (Value & 0x40000) >> 7; // J2 bit
-    out |= (Value & 0x20000) >> 4; // J1 bit
-    out |= (Value & 0x1F800) << 5; // imm6 field
-    out |= (Value & 0x007FF);      // imm11 field
-
-    uint32_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
-  }
-  case ARM::fixup_arm_thumb_bl: {
-    // The value doesn't encode the low bit (always zero) and is offset by
-    // four. The value is encoded into disjoint bit positions in the destination
-    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit
-    //
-    //   BL:  xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII
-    //
-    // Note that the halfwords are stored high first, low second; so we need
-    // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0;
-    uint32_t Binary = 0;
-    Value = 0x3fffff & ((Value - 4) >> 1);
-    Binary  = (Value & 0x7ff) << 16;    // Low imm11 value.
-    Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value.
-    Binary |= isNeg << 10;              // Sign bit.
-    return Binary;
-  }
-  case ARM::fixup_arm_thumb_blx: {
-    // The value doesn't encode the low two bits (always zero) and is offset by
-    // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit
-    // positions in the destination opcode. x = unchanged, I = immediate value
-    // bit, S = sign extension bit, 0 = zero.
-    //
-    //   BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0
-    //
-    // Note that the halfwords are stored high first, low second; so we need
-    // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0;
-    uint32_t Binary = 0;
-    Value = 0xfffff & ((Value - 2) >> 2);
-    Binary  = (Value & 0x3ff) << 17;    // Low imm10L value.
-    Binary |= (Value & 0xffc00) >> 10;  // High imm10H value.
-    Binary |= isNeg << 10;              // Sign bit.
-    return Binary;
-  }
-  case ARM::fixup_arm_thumb_cp:
-    // Offset by 4, and don't encode the low two bits. Two bytes of that
-    // 'off by 4' is implicitly handled by the half-word ordering of the
-    // Thumb encoding, so we only need to adjust by 2 here.
-    return ((Value - 2) >> 2) & 0xff;
-  case ARM::fixup_arm_thumb_cb: {
-    // Offset by 4 and don't encode the lower bit, which is always 0.
-    uint32_t Binary = (Value - 4) >> 1;
-    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
-  }
-  case ARM::fixup_arm_thumb_br:
-    // Offset by 4 and don't encode the lower bit, which is always 0.
-    return ((Value - 4) >> 1) & 0x7ff;
-  case ARM::fixup_arm_thumb_bcc:
-    // Offset by 4 and don't encode the lower bit, which is always 0.
-    return ((Value - 4) >> 1) & 0xff;
-  case ARM::fixup_arm_pcrel_10:
-    Value = Value - 4; // ARM fixups offset by an additional word and don't
-                       // need to adjust for the half-word ordering.
-    // Fall through.
-  case ARM::fixup_t2_pcrel_10: {
-    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
-    Value = Value - 4;
-    bool isAdd = true;
-    if ((int64_t)Value < 0) {
-      Value = -Value;
-      isAdd = false;
-    }
-    // These values don't encode the low two bits since they're always zero.
-    Value >>= 2;
-    assert ((Value < 256) && "Out of range pc-relative fixup value!");
-    Value |= isAdd << 23;
-
-    // Same addressing mode as fixup_arm_pcrel_10,
-    // but with 16-bit halfwords swapped.
-    if (Kind == ARM::fixup_t2_pcrel_10) {
-      uint32_t swapped = (Value & 0xFFFF0000) >> 16;
-      swapped |= (Value & 0x0000FFFF) << 16;
-      return swapped;
-    }
-
-    return Value;
-  }
-  }
-}
-
-namespace {
-
-// FIXME: This should be in a separate file.
-// ELF is an ELF of course...
-class ELFARMAsmBackend : public ARMAsmBackend {
-public:
-  Triple::OSType OSType;
-  ELFARMAsmBackend(const Target &T, Triple::OSType _OSType)
-    : ARMAsmBackend(T), OSType(_OSType) { }
-
-  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value) const;
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createELFObjectWriter(new ARMELFObjectWriter(OSType), OS,
-                              /*IsLittleEndian*/ true);
-  }
-};
-
-// FIXME: Raise this to share code between Darwin and ELF.
-void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
-                                  unsigned DataSize, uint64_t Value) const {
-  unsigned NumBytes = 4;        // FIXME: 2 for Thumb
-  Value = adjustFixupValue(Fixup.getKind(), Value);
-  if (!Value) return;           // Doesn't change encoding.
-
-  unsigned Offset = Fixup.getOffset();
-
-  // For each byte of the fragment that the fixup touches, mask in the bits from
-  // the fixup value. The Value has been "split up" into the appropriate
-  // bitfields above.
-  for (unsigned i = 0; i != NumBytes; ++i)
-    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-}
-
-// FIXME: This should be in a separate file.
-class DarwinARMAsmBackend : public ARMAsmBackend {
-public:
-  const object::mach::CPUSubtypeARM Subtype;
-  DarwinARMAsmBackend(const Target &T, object::mach::CPUSubtypeARM st)
-    : ARMAsmBackend(T), Subtype(st) { }
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_ARM,
-                                     Subtype);
-  }
-
-  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value) const;
-
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    return false;
-  }
-};
-
-/// getFixupKindNumBytes - The number of bytes the fixup may change.
-static unsigned getFixupKindNumBytes(unsigned Kind) {
-  switch (Kind) {
-  default:
-    llvm_unreachable("Unknown fixup kind!");
-
-  case FK_Data_1:
-  case ARM::fixup_arm_thumb_bcc:
-  case ARM::fixup_arm_thumb_cp:
-  case ARM::fixup_thumb_adr_pcrel_10:
-    return 1;
-
-  case FK_Data_2:
-  case ARM::fixup_arm_thumb_br:
-  case ARM::fixup_arm_thumb_cb:
-    return 2;
-
-  case ARM::fixup_arm_ldst_pcrel_12:
-  case ARM::fixup_arm_pcrel_10:
-  case ARM::fixup_arm_adr_pcrel_12:
-  case ARM::fixup_arm_condbranch:
-  case ARM::fixup_arm_uncondbranch:
-    return 3;
-
-  case FK_Data_4:
-  case ARM::fixup_t2_ldst_pcrel_12:
-  case ARM::fixup_t2_condbranch:
-  case ARM::fixup_t2_uncondbranch:
-  case ARM::fixup_t2_pcrel_10:
-  case ARM::fixup_t2_adr_pcrel_12:
-  case ARM::fixup_arm_thumb_bl:
-  case ARM::fixup_arm_thumb_blx:
-  case ARM::fixup_arm_movt_hi16:
-  case ARM::fixup_arm_movw_lo16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
-  case ARM::fixup_arm_movw_lo16_pcrel:
-  case ARM::fixup_t2_movt_hi16:
-  case ARM::fixup_t2_movw_lo16:
-  case ARM::fixup_t2_movt_hi16_pcrel:
-  case ARM::fixup_t2_movw_lo16_pcrel:
-    return 4;
-  }
-}
-
-void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
-                                     unsigned DataSize, uint64_t Value) const {
-  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
-  Value = adjustFixupValue(Fixup.getKind(), Value);
-  if (!Value) return;           // Doesn't change encoding.
-
-  unsigned Offset = Fixup.getOffset();
-  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
-
-  // For each byte of the fragment that the fixup touches, mask in the
-  // bits from the fixup value.
-  for (unsigned i = 0; i != NumBytes; ++i)
-    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-}
-
-} // end anonymous namespace
-
-TargetAsmBackend *llvm::createARMAsmBackend(const Target &T,
-                                            const std::string &TT) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin()) {
-    if (TheTriple.getArchName() == "armv4t" ||
-        TheTriple.getArchName() == "thumbv4t")
-      return new DarwinARMAsmBackend(T, object::mach::CSARM_V4T);
-    else if (TheTriple.getArchName() == "armv5e" ||
-        TheTriple.getArchName() == "thumbv5e")
-      return new DarwinARMAsmBackend(T, object::mach::CSARM_V5TEJ);
-    else if (TheTriple.getArchName() == "armv6" ||
-        TheTriple.getArchName() == "thumbv6")
-      return new DarwinARMAsmBackend(T, object::mach::CSARM_V6);
-    return new DarwinARMAsmBackend(T, object::mach::CSARM_V7);
-  }
-
-  if (TheTriple.isOSWindows())
-    assert(0 && "Windows not supported on ARM");
-
-  return new ELFARMAsmBackend(T, Triple(TT).getOS());
-}
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index dbc3ee41f3da8..ea3319fb0e03b 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -15,15 +15,15 @@
 #define DEBUG_TYPE "asm-printer"
 #include "ARM.h"
 #include "ARMAsmPrinter.h"
-#include "ARMAddressingModes.h"
 #include "ARMBuildAttrs.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMMachineFunctionInfo.h"
-#include "ARMMCExpr.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
 #include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
@@ -45,13 +45,13 @@
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
 using namespace llvm;
@@ -92,7 +92,7 @@ namespace {
       case ARMBuildAttrs::Advanced_SIMD_arch:
       case ARMBuildAttrs::VFP_arch:
         Streamer.EmitRawText(StringRef("\t.fpu ") + LowercaseString(String));
-        break;    
+        break;
       default: assert(0 && "Unsupported Text attribute in ASM Mode"); break;
       }
     }
@@ -100,13 +100,41 @@ namespace {
   };
 
   class ObjectAttributeEmitter : public AttributeEmitter {
+    // This structure holds all attributes, accounting for
+    // their string/numeric value, so we can later emmit them
+    // in declaration order, keeping all in the same vector
+    struct AttributeItemType {
+      enum {
+        HiddenAttribute = 0,
+        NumericAttribute,
+        TextAttribute
+      } Type;
+      unsigned Tag;
+      unsigned IntValue;
+      StringRef StringValue;
+    } AttributeItem;
+
     MCObjectStreamer &Streamer;
     StringRef CurrentVendor;
-    SmallString<64> Contents;
+    SmallVector<AttributeItemType, 64> Contents;
+
+    // Account for the ULEB/String size of each item,
+    // not just the number of items
+    size_t ContentsSize;
+    // FIXME: this should be in a more generic place, but
+    // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+    size_t getULEBSize(int Value) {
+      size_t Size = 0;
+      do {
+        Value >>= 7;
+        Size += sizeof(int8_t); // Is this really necessary?
+      } while (Value);
+      return Size;
+    }
 
   public:
     ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
-      Streamer(Streamer_), CurrentVendor("") { }
+      Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
 
     void MaybeSwitchVendor(StringRef Vendor) {
       assert(!Vendor.empty() && "Vendor cannot be empty.");
@@ -124,20 +152,32 @@ namespace {
     }
 
     void EmitAttribute(unsigned Attribute, unsigned Value) {
-      // FIXME: should be ULEB
-      Contents += Attribute;
-      Contents += Value;
+      AttributeItemType attr = {
+        AttributeItemType::NumericAttribute,
+        Attribute,
+        Value,
+        StringRef("")
+      };
+      ContentsSize += getULEBSize(Attribute);
+      ContentsSize += getULEBSize(Value);
+      Contents.push_back(attr);
     }
 
     void EmitTextAttribute(unsigned Attribute, StringRef String) {
-      Contents += Attribute;
-      Contents += UppercaseString(String);
-      Contents += 0;
+      AttributeItemType attr = {
+        AttributeItemType::TextAttribute,
+        Attribute,
+        0,
+        String
+      };
+      ContentsSize += getULEBSize(Attribute);
+      // String + \0
+      ContentsSize += String.size()+1;
+
+      Contents.push_back(attr);
     }
 
     void Finish() {
-      const size_t ContentsSize = Contents.size();
-
       // Vendor size + Vendor name + '\0'
       const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
 
@@ -151,7 +191,23 @@ namespace {
       Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
       Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
 
-      Streamer.EmitBytes(Contents, 0);
+      // Size should have been accounted for already, now
+      // emit each field as its type (ULEB or String)
+      for (unsigned int i=0; i<Contents.size(); ++i) {
+        AttributeItemType item = Contents[i];
+        Streamer.EmitULEB128IntValue(item.Tag, 0);
+        switch (item.Type) {
+        case AttributeItemType::NumericAttribute:
+          Streamer.EmitULEB128IntValue(item.IntValue, 0);
+          break;
+        case AttributeItemType::TextAttribute:
+          Streamer.EmitBytes(UppercaseString(item.StringValue), 0);
+          Streamer.EmitIntValue(0, 1); // '\0'
+          break;
+        default:
+          assert(0 && "Invalid attribute type");
+        }
+      }
 
       Contents.clear();
     }
@@ -184,7 +240,7 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
       // S registers are described as bit-pieces of a register
       // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
       // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-      
+
       unsigned SReg = Reg - ARM::S0;
       bool odd = SReg & 0x1;
       unsigned Rx = 256 + (SReg >> 1);
@@ -209,12 +265,13 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
     } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
       assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
       // Q registers Q0-Q15 are described by composing two D registers together.
-      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8)
+      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
+      // DW_OP_piece(8)
 
       unsigned QReg = Reg - ARM::Q0;
       unsigned D1 = 256 + 2 * QReg;
       unsigned D2 = D1 + 1;
-      
+
       OutStreamer.AddComment("DW_OP_regx for Q register: D1");
       EmitInt8(dwarf::DW_OP_regx);
       EmitULEB128(D1);
@@ -233,6 +290,8 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
 }
 
 void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  OutStreamer.ForceCodeRegion();
+
   if (AFI->isThumbFunction()) {
     OutStreamer.EmitAssemblerFlag(MCAF_Code16);
     OutStreamer.EmitThumbFunc(CurrentFnSym);
@@ -395,16 +454,16 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       // This takes advantage of the 2 operand-ness of ldm/stm and that we've
       // already got the operands in registers that are operands to the
       // inline asm statement.
-      
+
       O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
-      
+
       // FIXME: The register allocator not only may not have given us the
       // registers in sequence, but may not be in ascending registers. This
       // will require changes in the register allocator that'll need to be
       // propagated down here if the operands change.
       unsigned RegOps = OpNum + 1;
       while (MI->getOperand(RegOps).isReg()) {
-        O << ", " 
+        O << ", "
           << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
         RegOps++;
       }
@@ -413,14 +472,34 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
 
       return false;
     }
+    case 'R': // The most significant register of a pair.
+    case 'Q': { // The least significant register of a pair.
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+
     // These modifiers are not yet supported.
     case 'p': // The high single-precision register of a VFP double-precision
               // register.
     case 'e': // The low doubleword register of a NEON quad register.
     case 'f': // The high doubleword register of a NEON quad register.
     case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
-    case 'Q': // The least significant register of a pair.
-    case 'R': // The most significant register of a pair.
     case 'H': // The highest-numbered register of a pair.
       return true;
     }
@@ -437,7 +516,7 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   // Does this asm operand have a single letter operand modifier?
   if (ExtraCode && ExtraCode[0]) {
     if (ExtraCode[1] != 0) return true; // Unknown modifier.
-    
+
     switch (ExtraCode[0]) {
       case 'A': // A memory operand for a VLD1/VST1 instruction.
       default: return true;  // Unknown modifier.
@@ -448,7 +527,7 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
         return false;
     }
   }
-  
+
   const MachineOperand &MO = MI->getOperand(OpNum);
   assert(MO.isReg() && "unexpected inline asm memory operand");
   O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
@@ -772,13 +851,19 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
     OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
     MCSym = OutContext.GetOrCreateSymbol(OS.str());
   } else if (ACPV->isBlockAddress()) {
-    MCSym = GetBlockAddressSymbol(ACPV->getBlockAddress());
+    const BlockAddress *BA =
+      cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+    MCSym = GetBlockAddressSymbol(BA);
   } else if (ACPV->isGlobalValue()) {
-    const GlobalValue *GV = ACPV->getGV();
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
     MCSym = GetARMGVSymbol(GV);
+  } else if (ACPV->isMachineBasicBlock()) {
+    const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+    MCSym = MBB->getSymbol();
   } else {
     assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
-    MCSym = GetExternalSymbolSymbol(ACPV->getSymbol());
+    const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+    MCSym = GetExternalSymbolSymbol(Sym);
   }
 
   // Create an MCSymbol for the reference.
@@ -822,6 +907,9 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
   const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
   unsigned JTI = MO1.getIndex();
 
+  // Tag the jump table appropriately for precise disassembly.
+  OutStreamer.EmitJumpTable32Region();
+
   // Emit a label for the jump table.
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
   OutStreamer.EmitLabel(JTISymbol);
@@ -847,6 +935,11 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
       Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
                                                                    OutContext),
                                      OutContext);
+    // If we're generating a table of Thumb addresses in static relocation
+    // model, we need to add one to keep interworking correctly.
+    else if (AFI->isThumbFunction())
+      Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext),
+                                     OutContext);
     OutStreamer.EmitValue(Expr, 4);
   }
 }
@@ -859,6 +952,14 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
   unsigned JTI = MO1.getIndex();
 
   // Emit a label for the jump table.
+  if (MI->getOpcode() == ARM::t2TBB_JT) {
+    OutStreamer.EmitJumpTable8Region();
+  } else if (MI->getOpcode() == ARM::t2TBH_JT) {
+    OutStreamer.EmitJumpTable16Region();
+  } else {
+    OutStreamer.EmitJumpTable32Region();
+  }
+
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
   OutStreamer.EmitLabel(JTISymbol);
 
@@ -881,6 +982,8 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
       MCInst BrInst;
       BrInst.setOpcode(ARM::t2B);
       BrInst.addOperand(MCOperand::CreateExpr(MBBSymbolExpr));
+      BrInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      BrInst.addOperand(MCOperand::CreateReg(0));
       OutStreamer.EmitInstruction(BrInst);
       continue;
     }
@@ -994,7 +1097,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
            i != NumOps; ++i)
         RegList.push_back(MI->getOperand(i).getReg());
       break;
-    case ARM::STR_PRE:
+    case ARM::STR_PRE_IMM:
+    case ARM::STR_PRE_REG:
       assert(MI->getOperand(2).getReg() == ARM::SP &&
              "Only stack pointer as a source reg is supported");
       RegList.push_back(SrcReg);
@@ -1074,10 +1178,20 @@ extern cl::opt<bool> EnableARMEHABI;
 #include "ARMGenMCPseudoLowering.inc"
 
 void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY)
+    OutStreamer.EmitCodeRegion();
+
+  // Emit unwinding stuff for frame-related instructions
+  if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
+    EmitUnwindingInstruction(MI);
+
   // Do any auto-generated pseudo lowerings.
   if (emitPseudoExpansionLowering(OutStreamer, MI))
     return;
 
+  assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+         "Pseudo flag setting opcode should be expanded early");
+
   // Check for manual lowerings.
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
@@ -1372,6 +1486,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
 
     EmitAlignment(2);
+
+    // Mark the constant pool entry as data if we're not already in a data
+    // region.
+    OutStreamer.EmitDataRegion();
     OutStreamer.EmitLabel(GetCPISymbol(LabelId));
 
     const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
@@ -1379,7 +1497,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
     else
       EmitGlobalConstant(MCPE.Val.ConstVal);
-
     return;
   }
   case ARM::t2BR_JT: {
@@ -1590,6 +1707,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCInst TmpInst;
       TmpInst.setOpcode(ARM::tB);
       TmpInst.addOperand(MCOperand::CreateExpr(SymbolExpr));
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
       OutStreamer.EmitInstruction(TmpInst);
     }
     {
@@ -1804,10 +1923,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
   LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
 
-  // Emit unwinding stuff for frame-related instructions
-  if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
-    EmitUnwindingInstruction(MI);
-
   OutStreamer.EmitInstruction(TmpInst);
 }
 
@@ -1815,20 +1930,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 // Target Registry Stuff
 //===----------------------------------------------------------------------===//
 
-static MCInstPrinter *createARMMCInstPrinter(const Target &T,
-                                             unsigned SyntaxVariant,
-                                             const MCAsmInfo &MAI) {
-  if (SyntaxVariant == 0)
-    return new ARMInstPrinter(MAI);
-  return 0;
-}
-
 // Force static initialization.
 extern "C" void LLVMInitializeARMAsmPrinter() {
   RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
   RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
-
-  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
 }
 
diff --git a/lib/Target/ARM/ARMBaseInfo.h b/lib/Target/ARM/ARMBaseInfo.h
deleted file mode 100644
index 458f7dd1f784e..0000000000000
--- a/lib/Target/ARM/ARMBaseInfo.h
+++ /dev/null
@@ -1,294 +0,0 @@
-//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone helper functions and enum definitions for
-// the ARM target useful for the compiler back-end and the MC libraries.
-// As such, it deliberately does not include references to LLVM core
-// code gen types, passes, etc..
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMBASEINFO_H
-#define ARMBASEINFO_H
-
-#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "llvm/Support/ErrorHandling.h"
-
-// Note that the following auto-generated files only defined enum types, and
-// so are safe to include here.
-
-namespace llvm {
-
-// Enums corresponding to ARM condition codes
-namespace ARMCC {
-  // The CondCodes constants map directly to the 4-bit encoding of the
-  // condition field for predicated instructions.
-  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
-    EQ,            // Equal                      Equal
-    NE,            // Not equal                  Not equal, or unordered
-    HS,            // Carry set                  >, ==, or unordered
-    LO,            // Carry clear                Less than
-    MI,            // Minus, negative            Less than
-    PL,            // Plus, positive or zero     >, ==, or unordered
-    VS,            // Overflow                   Unordered
-    VC,            // No overflow                Not unordered
-    HI,            // Unsigned higher            Greater than, or unordered
-    LS,            // Unsigned lower or same     Less than or equal
-    GE,            // Greater than or equal      Greater than or equal
-    LT,            // Less than                  Less than, or unordered
-    GT,            // Greater than               Greater than
-    LE,            // Less than or equal         <, ==, or unordered
-    AL             // Always (unconditional)     Always (unconditional)
-  };
-
-  inline static CondCodes getOppositeCondition(CondCodes CC) {
-    switch (CC) {
-    default: llvm_unreachable("Unknown condition code");
-    case EQ: return NE;
-    case NE: return EQ;
-    case HS: return LO;
-    case LO: return HS;
-    case MI: return PL;
-    case PL: return MI;
-    case VS: return VC;
-    case VC: return VS;
-    case HI: return LS;
-    case LS: return HI;
-    case GE: return LT;
-    case LT: return GE;
-    case GT: return LE;
-    case LE: return GT;
-    }
-  }
-} // namespace ARMCC
-
-inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
-  switch (CC) {
-  default: llvm_unreachable("Unknown condition code");
-  case ARMCC::EQ:  return "eq";
-  case ARMCC::NE:  return "ne";
-  case ARMCC::HS:  return "hs";
-  case ARMCC::LO:  return "lo";
-  case ARMCC::MI:  return "mi";
-  case ARMCC::PL:  return "pl";
-  case ARMCC::VS:  return "vs";
-  case ARMCC::VC:  return "vc";
-  case ARMCC::HI:  return "hi";
-  case ARMCC::LS:  return "ls";
-  case ARMCC::GE:  return "ge";
-  case ARMCC::LT:  return "lt";
-  case ARMCC::GT:  return "gt";
-  case ARMCC::LE:  return "le";
-  case ARMCC::AL:  return "al";
-  }
-}
-
-namespace ARM_PROC {
-  enum IMod {
-    IE = 2,
-    ID = 3
-  };
-
-  enum IFlags {
-    F = 1,
-    I = 2,
-    A = 4
-  };
-
-  inline static const char *IFlagsToString(unsigned val) {
-    switch (val) {
-    default: llvm_unreachable("Unknown iflags operand");
-    case F: return "f";
-    case I: return "i";
-    case A: return "a";
-    }
-  }
-
-  inline static const char *IModToString(unsigned val) {
-    switch (val) {
-    default: llvm_unreachable("Unknown imod operand");
-    case IE: return "ie";
-    case ID: return "id";
-    }
-  }
-}
-
-namespace ARM_MB {
-  // The Memory Barrier Option constants map directly to the 4-bit encoding of
-  // the option field for memory barrier operations.
-  enum MemBOpt {
-    SY    = 15,
-    ST    = 14,
-    ISH   = 11,
-    ISHST = 10,
-    NSH   = 7,
-    NSHST = 6,
-    OSH   = 3,
-    OSHST = 2
-  };
-
-  inline static const char *MemBOptToString(unsigned val) {
-    switch (val) {
-    default: llvm_unreachable("Unknown memory operation");
-    case SY:    return "sy";
-    case ST:    return "st";
-    case ISH:   return "ish";
-    case ISHST: return "ishst";
-    case NSH:   return "nsh";
-    case NSHST: return "nshst";
-    case OSH:   return "osh";
-    case OSHST: return "oshst";
-    }
-  }
-} // namespace ARM_MB
-
-/// getARMRegisterNumbering - Given the enum value for some register, e.g.
-/// ARM::LR, return the number that it corresponds to (e.g. 14).
-inline static unsigned getARMRegisterNumbering(unsigned Reg) {
-  using namespace ARM;
-  switch (Reg) {
-  default:
-    llvm_unreachable("Unknown ARM register!");
-  case R0:  case S0:  case D0:  case Q0:  return 0;
-  case R1:  case S1:  case D1:  case Q1:  return 1;
-  case R2:  case S2:  case D2:  case Q2:  return 2;
-  case R3:  case S3:  case D3:  case Q3:  return 3;
-  case R4:  case S4:  case D4:  case Q4:  return 4;
-  case R5:  case S5:  case D5:  case Q5:  return 5;
-  case R6:  case S6:  case D6:  case Q6:  return 6;
-  case R7:  case S7:  case D7:  case Q7:  return 7;
-  case R8:  case S8:  case D8:  case Q8:  return 8;
-  case R9:  case S9:  case D9:  case Q9:  return 9;
-  case R10: case S10: case D10: case Q10: return 10;
-  case R11: case S11: case D11: case Q11: return 11;
-  case R12: case S12: case D12: case Q12: return 12;
-  case SP:  case S13: case D13: case Q13: return 13;
-  case LR:  case S14: case D14: case Q14: return 14;
-  case PC:  case S15: case D15: case Q15: return 15;
-
-  case S16: case D16: return 16;
-  case S17: case D17: return 17;
-  case S18: case D18: return 18;
-  case S19: case D19: return 19;
-  case S20: case D20: return 20;
-  case S21: case D21: return 21;
-  case S22: case D22: return 22;
-  case S23: case D23: return 23;
-  case S24: case D24: return 24;
-  case S25: case D25: return 25;
-  case S26: case D26: return 26;
-  case S27: case D27: return 27;
-  case S28: case D28: return 28;
-  case S29: case D29: return 29;
-  case S30: case D30: return 30;
-  case S31: case D31: return 31;
-  }
-}
-
-namespace ARMII {
-
-  /// ARM Index Modes
-  enum IndexMode {
-    IndexModeNone  = 0,
-    IndexModePre   = 1,
-    IndexModePost  = 2,
-    IndexModeUpd   = 3
-  };
-
-  /// ARM Addressing Modes
-  enum AddrMode {
-    AddrModeNone    = 0,
-    AddrMode1       = 1,
-    AddrMode2       = 2,
-    AddrMode3       = 3,
-    AddrMode4       = 4,
-    AddrMode5       = 5,
-    AddrMode6       = 6,
-    AddrModeT1_1    = 7,
-    AddrModeT1_2    = 8,
-    AddrModeT1_4    = 9,
-    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
-    AddrModeT2_i12  = 11,
-    AddrModeT2_i8   = 12,
-    AddrModeT2_so   = 13,
-    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
-    AddrModeT2_i8s4 = 15, // i8 * 4
-    AddrMode_i12    = 16
-  };
-
-  inline static const char *AddrModeToString(AddrMode addrmode) {
-    switch (addrmode) {
-    default: llvm_unreachable("Unknown memory operation");
-    case AddrModeNone:    return "AddrModeNone";
-    case AddrMode1:       return "AddrMode1";
-    case AddrMode2:       return "AddrMode2";
-    case AddrMode3:       return "AddrMode3";
-    case AddrMode4:       return "AddrMode4";
-    case AddrMode5:       return "AddrMode5";
-    case AddrMode6:       return "AddrMode6";
-    case AddrModeT1_1:    return "AddrModeT1_1";
-    case AddrModeT1_2:    return "AddrModeT1_2";
-    case AddrModeT1_4:    return "AddrModeT1_4";
-    case AddrModeT1_s:    return "AddrModeT1_s";
-    case AddrModeT2_i12:  return "AddrModeT2_i12";
-    case AddrModeT2_i8:   return "AddrModeT2_i8";
-    case AddrModeT2_so:   return "AddrModeT2_so";
-    case AddrModeT2_pc:   return "AddrModeT2_pc";
-    case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
-    case AddrMode_i12:    return "AddrMode_i12";
-    }
-  }
-
-  /// Target Operand Flag enum.
-  enum TOF {
-    //===------------------------------------------------------------------===//
-    // ARM Specific MachineOperand flags.
-
-    MO_NO_FLAG,
-
-    /// MO_LO16 - On a symbol operand, this represents a relocation containing
-    /// lower 16 bit of the address. Used only via movw instruction.
-    MO_LO16,
-
-    /// MO_HI16 - On a symbol operand, this represents a relocation containing
-    /// higher 16 bit of the address. Used only via movt instruction.
-    MO_HI16,
-
-    /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a
-    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
-    /// i.e. "FOO$non_lazy_ptr".
-    /// Used only via movw instruction.
-    MO_LO16_NONLAZY,
-
-    /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a
-    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
-    /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction.
-    MO_HI16_NONLAZY,
-
-    /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
-    /// relocation containing lower 16 bit of the PC relative address of the
-    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
-    /// Used only via movw instruction.
-    MO_LO16_NONLAZY_PIC,
-
-    /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
-    /// relocation containing lower 16 bit of the PC relative address of the
-    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
-    /// Used only via movt instruction.
-    MO_HI16_NONLAZY_PIC,
-
-    /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
-    /// call operand.
-    MO_PLT
-  };
-} // end namespace ARMII
-
-} // end namespace llvm;
-
-#endif
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 649bd7d5ce3f9..408edfc20d4cc 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -13,11 +13,11 @@
 
 #include "ARMBaseInstrInfo.h"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMHazardRecognizer.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/GlobalValue.h"
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/CommandLine.h"
@@ -45,6 +46,10 @@ static cl::opt<bool>
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                cl::desc("Enable ARM 2-addr to 3-addr conv"));
 
+static cl::opt<bool>
+WidenVMOVS("widen-vmovs", cl::Hidden,
+           cl::desc("Widen ARM vmovs to vmovd when possible"));
+
 /// ARM_MLxEntry - Record information about MLA / MLS instructions.
 struct ARM_MLxEntry {
   unsigned MLxOpc;     // MLA / MLS opcode
@@ -171,7 +176,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
       unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
       UpdateMI = BuildMI(MF, MI->getDebugLoc(),
-                         get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+                         get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
         .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
         .addImm(Pred).addReg(0).addReg(0);
     } else
@@ -399,6 +404,7 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
     ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
   int BccOpc = !AFI->isThumbFunction()
     ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+  bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
 
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -406,9 +412,12 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
          "ARM branch conditions have two components!");
 
   if (FBB == 0) {
-    if (Cond.empty()) // Unconditional branch?
-      BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
-    else
+    if (Cond.empty()) { // Unconditional branch?
+      if (isThumb)
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+      else
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else
       BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
         .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
     return 1;
@@ -417,7 +426,10 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   // Two-way conditional branch.
   BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
     .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
-  BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+  if (isThumb)
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
   return 2;
 }
 
@@ -627,7 +639,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   bool SPRDest = ARM::SPRRegClass.contains(DestReg);
   bool SPRSrc  = ARM::SPRRegClass.contains(SrcReg);
 
-  unsigned Opc;
+  unsigned Opc = 0;
   if (SPRDest && SPRSrc)
     Opc = ARM::VMOVS;
   else if (GPRDest && SPRSrc)
@@ -638,19 +650,40 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opc = ARM::VMOVD;
   else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
     Opc = ARM::VORRq;
-  else if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVQQ;
-  else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVQQQQ;
-  else
-    llvm_unreachable("Impossible reg-to-reg copy");
 
-  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
-  MIB.addReg(SrcReg, getKillRegState(KillSrc));
-  if (Opc == ARM::VORRq)
+  if (Opc) {
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
     MIB.addReg(SrcReg, getKillRegState(KillSrc));
-  if (Opc != ARM::VMOVQQ && Opc != ARM::VMOVQQQQ)
+    if (Opc == ARM::VORRq)
+      MIB.addReg(SrcReg, getKillRegState(KillSrc));
     AddDefaultPred(MIB);
+    return;
+  }
+
+  // Generate instructions for VMOVQQ and VMOVQQQQ pseudos in place.
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg) ||
+      ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    assert(ARM::qsub_0 + 3 == ARM::qsub_3 && "Expected contiguous enum.");
+    unsigned EndSubReg = ARM::QQPRRegClass.contains(DestReg, SrcReg) ?
+      ARM::qsub_1 : ARM::qsub_3;
+    for (unsigned i = ARM::qsub_0, e = EndSubReg + 1; i != e; ++i) {
+      unsigned Dst = TRI->getSubReg(DestReg, i);
+      unsigned Src = TRI->getSubReg(SrcReg, i);
+      MachineInstrBuilder Mov =
+        AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VORRq))
+                       .addReg(Dst, RegState::Define)
+                       .addReg(Src, getKillRegState(KillSrc))
+                       .addReg(Src, getKillRegState(KillSrc)));
+      if (i == EndSubReg) {
+        Mov->addRegisterDefined(DestReg, TRI);
+        if (KillSrc)
+          Mov->addRegisterKilled(SrcReg, TRI);
+      }
+    }
+    return;
+  }
+  llvm_unreachable("Impossible reg-to-reg copy");
 }
 
 static const
@@ -683,82 +716,84 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                             MFI.getObjectSize(FI),
                             Align);
 
-  // tGPR is used sometimes in ARM instructions that need to avoid using
-  // certain registers.  Just treat it as GPR here. Likewise, rGPR.
-  if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass
-      || RC == ARM::rGPRRegisterClass)
-    RC = ARM::GPRRegisterClass;
-
-  switch (RC->getID()) {
-  case ARM::GPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+  switch (RC->getSize()) {
+    case 4:
+      if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
                    .addReg(SrcReg, getKillRegState(isKill))
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
-    break;
-  case ARM::SPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+      } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
                    .addReg(SrcReg, getKillRegState(isKill))
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
-    break;
-  case ARM::DPRRegClassID:
-  case ARM::DPR_VFP2RegClassID:
-  case ARM::DPR_8RegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 8:
+      if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
                    .addReg(SrcReg, getKillRegState(isKill))
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
-    break;
-  case ARM::QPRRegClassID:
-  case ARM::QPR_VFP2RegClassID:
-  case ARM::QPR_8RegClassID:
-    if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo))
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 16:
+      if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+        if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo))
                      .addFrameIndex(FI).addImm(16)
                      .addReg(SrcReg, getKillRegState(isKill))
                      .addMemOperand(MMO));
-    } else {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+        } else {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
                      .addReg(SrcReg, getKillRegState(isKill))
                      .addFrameIndex(FI)
                      .addMemOperand(MMO));
-    }
-    break;
-  case ARM::QQPRRegClassID:
-  case ARM::QQPR_VFP2RegClassID:
-    if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
-      // FIXME: It's possible to only store part of the QQ register if the
-      // spilled def has a sub-register index.
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 32:
+      if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          // FIXME: It's possible to only store part of the QQ register if the
+          // spilled def has a sub-register index.
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
                      .addFrameIndex(FI).addImm(16)
                      .addReg(SrcReg, getKillRegState(isKill))
                      .addMemOperand(MMO));
-    } else {
-      MachineInstrBuilder MIB =
-        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
                        .addFrameIndex(FI))
-        .addMemOperand(MMO);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-            AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
-    }
-    break;
-  case ARM::QQQQPRRegClassID: {
-    MachineInstrBuilder MIB =
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
-                     .addFrameIndex(FI))
-      .addMemOperand(MMO);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
-    MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
-          AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
-    break;
-  }
-  default:
-    llvm_unreachable("Unknown regclass!");
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+                AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 64:
+      if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                         .addFrameIndex(FI))
+                         .addMemOperand(MMO);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+              AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    default:
+      llvm_unreachable("Unknown reg class!");
   }
 }
 
@@ -809,6 +844,12 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   return 0;
 }
 
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->getDesc().mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+}
+
 void ARMBaseInstrInfo::
 loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                      unsigned DestReg, int FI,
@@ -826,72 +867,77 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                             MFI.getObjectSize(FI),
                             Align);
 
-  // tGPR is used sometimes in ARM instructions that need to avoid using
-  // certain registers.  Just treat it as GPR here.
-  if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass
-      || RC == ARM::rGPRRegisterClass)
-    RC = ARM::GPRRegisterClass;
-
-  switch (RC->getID()) {
-  case ARM::GPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+  switch (RC->getSize()) {
+  case 4:
+    if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
-    break;
-  case ARM::SPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+
+    } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else
+      llvm_unreachable("Unknown reg class!");
     break;
-  case ARM::DPRRegClassID:
-  case ARM::DPR_VFP2RegClassID:
-  case ARM::DPR_8RegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+  case 8:
+    if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else
+      llvm_unreachable("Unknown reg class!");
     break;
-  case ARM::QPRRegClassID:
-  case ARM::QPR_VFP2RegClassID:
-  case ARM::QPR_8RegClassID:
-    if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg)
+  case 16:
+    if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg)
                      .addFrameIndex(FI).addImm(16)
                      .addMemOperand(MMO));
-    } else {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
-                     .addFrameIndex(FI)
-                     .addMemOperand(MMO));
-    }
+      } else {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+                       .addFrameIndex(FI)
+                       .addMemOperand(MMO));
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
     break;
-  case ARM::QQPRRegClassID:
-  case ARM::QQPR_VFP2RegClassID:
-    if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+  case 32:
+    if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
                      .addFrameIndex(FI).addImm(16)
                      .addMemOperand(MMO));
-    } else {
-      MachineInstrBuilder MIB =
+      } else {
+        MachineInstrBuilder MIB =
         AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
                        .addFrameIndex(FI))
-        .addMemOperand(MMO);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
-            AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
-    }
+                       .addMemOperand(MMO);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
+        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
     break;
-  case ARM::QQQQPRRegClassID: {
-    MachineInstrBuilder MIB =
+  case 64:
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB =
       AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
                      .addFrameIndex(FI))
-      .addMemOperand(MMO);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI);
-    MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI);
-    AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI);
+                     .addMemOperand(MMO);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI);
+      MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+    } else
+      llvm_unreachable("Unknown reg class!");
     break;
-  }
   default:
     llvm_unreachable("Unknown regclass!");
   }
@@ -944,6 +990,78 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
   return 0;
 }
 
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->getDesc().mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
+  // This hook gets to expand COPY instructions before they become
+  // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
+  // widened to VMOVD.  We prefer the VMOVD when possible because it may be
+  // changed into a VORR that can go down the NEON pipeline.
+  if (!WidenVMOVS || !MI->isCopy())
+    return false;
+
+  // Look for a copy between even S-registers.  That is where we keep floats
+  // when using NEON v2f32 instructions for f32 arithmetic.
+  unsigned DstRegS = MI->getOperand(0).getReg();
+  unsigned SrcRegS = MI->getOperand(1).getReg();
+  if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+    return false;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  if (!DstRegD || !SrcRegD)
+    return false;
+
+  // We want to widen this into a DstRegD = VMOVD SrcRegD copy.  This is only
+  // legal if the COPY already defines the full DstRegD, and it isn't a
+  // sub-register insertion.
+  if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+    return false;
+
+  // A dead copy shouldn't show up here, but reject it just in case.
+  if (MI->getOperand(0).isDead())
+    return false;
+
+  // All clear, widen the COPY.
+  DEBUG(dbgs() << "widening:    " << *MI);
+
+  // Get rid of the old <imp-def> of DstRegD.  Leave it if it defines a Q-reg
+  // or some other super-register.
+  int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+  if (ImpDefIdx != -1)
+    MI->RemoveOperand(ImpDefIdx);
+
+  // Change the opcode and operands.
+  MI->setDesc(get(ARM::VMOVD));
+  MI->getOperand(0).setReg(DstRegD);
+  MI->getOperand(1).setReg(SrcRegD);
+  AddDefaultPred(MachineInstrBuilder(MI));
+
+  // We are now reading SrcRegD instead of SrcRegS.  This may upset the
+  // register scavenger and machine verifier, so we need to indicate that we
+  // are reading an undefined value from SrcRegD, but a proper value from
+  // SrcRegS.
+  MI->getOperand(1).setIsUndef();
+  MachineInstrBuilder(MI).addReg(SrcRegS, RegState::Implicit);
+
+  // SrcRegD may actually contain an unrelated value in the ssub_1
+  // sub-register.  Don't kill it.  Only kill the ssub_0 sub-register.
+  if (MI->getOperand(1).isKill()) {
+    MI->getOperand(1).setIsKill(false);
+    MI->addRegisterKilled(SrcRegS, TRI, true);
+  }
+
+  DEBUG(dbgs() << "replaced by: " << *MI);
+  return true;
+}
+
 MachineInstr*
 ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
                                            int FrameIx, uint64_t Offset,
@@ -974,17 +1092,24 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
   // instructions, so that's probably OK, but is PIC always correct when
   // we get here?
   if (ACPV->isGlobalValue())
-    NewCPV = new ARMConstantPoolValue(ACPV->getGV(), PCLabelId,
-                                      ARMCP::CPValue, 4);
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
+             ARMCP::CPValue, 4);
   else if (ACPV->isExtSymbol())
-    NewCPV = new ARMConstantPoolValue(MF.getFunction()->getContext(),
-                                      ACPV->getSymbol(), PCLabelId, 4);
+    NewCPV = ARMConstantPoolSymbol::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4);
   else if (ACPV->isBlockAddress())
-    NewCPV = new ARMConstantPoolValue(ACPV->getBlockAddress(), PCLabelId,
-                                      ARMCP::CPBlockAddress, 4);
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId,
+             ARMCP::CPBlockAddress, 4);
   else if (ACPV->isLSDA())
-    NewCPV = new ARMConstantPoolValue(MF.getFunction(), PCLabelId,
-                                      ARMCP::CPLSDA, 4);
+    NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId,
+                                             ARMCP::CPLSDA, 4);
+  else if (ACPV->isMachineBasicBlock())
+    NewCPV = ARMConstantPoolMBB::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
   else
     llvm_unreachable("Unexpected ARM constantpool value type!!");
   CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
@@ -1289,7 +1414,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
   // Attempt to estimate the relative costs of predication versus branching.
   unsigned TUnpredCost = Probability.getNumerator() * TCycles;
   TUnpredCost /= Probability.getDenominator();
-    
+
   uint32_t Comp = Probability.getDenominator() - Probability.getNumerator();
   unsigned FUnpredCost = Comp * FCycles;
   FUnpredCost /= Probability.getDenominator();
@@ -1330,6 +1455,57 @@ int llvm::getMatchingCondBranchOpcode(int Opc) {
 }
 
 
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+  unsigned PseudoOpc;
+  unsigned MachineOpc;
+};
+
+static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADDSri, ARM::ADDri},
+  {ARM::ADDSrr, ARM::ADDrr},
+  {ARM::ADDSrsi, ARM::ADDrsi},
+  {ARM::ADDSrsr, ARM::ADDrsr},
+
+  {ARM::SUBSri, ARM::SUBri},
+  {ARM::SUBSrr, ARM::SUBrr},
+  {ARM::SUBSrsi, ARM::SUBrsi},
+  {ARM::SUBSrsr, ARM::SUBrsr},
+
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrr, ARM::RSBrr},
+  {ARM::RSBSrsi, ARM::RSBrsi},
+  {ARM::RSBSrsr, ARM::RSBrsr},
+
+  {ARM::t2ADDSri, ARM::t2ADDri},
+  {ARM::t2ADDSrr, ARM::t2ADDrr},
+  {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+  {ARM::t2SUBSri, ARM::t2SUBri},
+  {ARM::t2SUBSrr, ARM::t2SUBrr},
+  {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+  static const int NPairs =
+    sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
+  for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0],
+         *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) {
+    if (OldOpc == OpcPair->PseudoOpc) {
+      return OpcPair->MachineOpc;
+    }
+  }
+  return 0;
+}
+
 void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator &MBBI, DebugLoc dl,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
@@ -1862,7 +2038,6 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
   case ARM::STMIB_UPD:
   case ARM::tLDMIA:
   case ARM::tLDMIA_UPD:
-  case ARM::tSTMIA:
   case ARM::tSTMIA_UPD:
   case ARM::tPOP_RET:
   case ARM::tPOP:
@@ -2128,7 +2303,6 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   case ARM::STMDA_UPD:
   case ARM::STMDB_UPD:
   case ARM::STMIB_UPD:
-  case ARM::tSTMIA:
   case ARM::tSTMIA_UPD:
   case ARM::tPOP_RET:
   case ARM::tPOP:
@@ -2567,6 +2741,15 @@ hasLowDefLatency(const InstrItineraryData *ItinData,
   return false;
 }
 
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                         StringRef &ErrInfo) const {
+  if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+    ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+    return false;
+  }
+  return true;
+}
+
 bool
 ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
                                      unsigned &AddSubOpc,
@@ -2582,3 +2765,66 @@ ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
   HasLane = Entry.HasLane;
   return true;
 }
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both.  The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain {
+  ExeGeneric = 0,
+  ExeVFP = 1,
+  ExeNEON = 2
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+  // VMOVD is a VFP instruction, but can be changed to NEON if it isn't
+  // predicated.
+  if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+    return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+  // No other instructions can be swizzled, so just determine their domain.
+  unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+
+  if (Domain & ARMII::DomainNEON)
+    return std::make_pair(ExeNEON, 0);
+
+  // Certain instructions can go either way on Cortex-A8.
+  // Treat them as NEON instructions.
+  if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8())
+    return std::make_pair(ExeNEON, 0);
+
+  if (Domain & ARMII::DomainVFP)
+    return std::make_pair(ExeVFP, 0);
+
+  return std::make_pair(ExeGeneric, 0);
+}
+
+void
+ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+  // We only know how to change VMOVD into VORR.
+  assert(MI->getOpcode() == ARM::VMOVD && "Can only swizzle VMOVD");
+  if (Domain != ExeNEON)
+    return;
+
+  // Zap the predicate operands.
+  assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+  MI->RemoveOperand(3);
+  MI->RemoveOperand(2);
+
+  // Change to a VORRd which requires two identical use operands.
+  MI->setDesc(get(ARM::VORRd));
+
+  // Add the extra source operand and new predicates.
+  // This will go before any implicit ops.
+  AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1)));
+}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 507e8974bf7b2..0f9f32179a310 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -27,146 +27,6 @@ namespace llvm {
   class ARMSubtarget;
   class ARMBaseRegisterInfo;
 
-/// ARMII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace ARMII {
-  enum {
-    //===------------------------------------------------------------------===//
-    // Instruction Flags.
-
-    //===------------------------------------------------------------------===//
-    // This four-bit field describes the addressing mode used.
-    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
-
-    // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
-    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
-    // The index mode enums are declared in ARMBaseInfo.h
-    IndexModeShift = 5,
-    IndexModeMask  = 3 << IndexModeShift,
-
-    //===------------------------------------------------------------------===//
-    // Instruction encoding formats.
-    //
-    FormShift     = 7,
-    FormMask      = 0x3f << FormShift,
-
-    // Pseudo instructions
-    Pseudo        = 0  << FormShift,
-
-    // Multiply instructions
-    MulFrm        = 1  << FormShift,
-
-    // Branch instructions
-    BrFrm         = 2  << FormShift,
-    BrMiscFrm     = 3  << FormShift,
-
-    // Data Processing instructions
-    DPFrm         = 4  << FormShift,
-    DPSoRegFrm    = 5  << FormShift,
-
-    // Load and Store
-    LdFrm         = 6  << FormShift,
-    StFrm         = 7  << FormShift,
-    LdMiscFrm     = 8  << FormShift,
-    StMiscFrm     = 9  << FormShift,
-    LdStMulFrm    = 10 << FormShift,
-
-    LdStExFrm     = 11 << FormShift,
-
-    // Miscellaneous arithmetic instructions
-    ArithMiscFrm  = 12 << FormShift,
-    SatFrm        = 13 << FormShift,
-
-    // Extend instructions
-    ExtFrm        = 14 << FormShift,
-
-    // VFP formats
-    VFPUnaryFrm   = 15 << FormShift,
-    VFPBinaryFrm  = 16 << FormShift,
-    VFPConv1Frm   = 17 << FormShift,
-    VFPConv2Frm   = 18 << FormShift,
-    VFPConv3Frm   = 19 << FormShift,
-    VFPConv4Frm   = 20 << FormShift,
-    VFPConv5Frm   = 21 << FormShift,
-    VFPLdStFrm    = 22 << FormShift,
-    VFPLdStMulFrm = 23 << FormShift,
-    VFPMiscFrm    = 24 << FormShift,
-
-    // Thumb format
-    ThumbFrm      = 25 << FormShift,
-
-    // Miscelleaneous format
-    MiscFrm       = 26 << FormShift,
-
-    // NEON formats
-    NGetLnFrm     = 27 << FormShift,
-    NSetLnFrm     = 28 << FormShift,
-    NDupFrm       = 29 << FormShift,
-    NLdStFrm      = 30 << FormShift,
-    N1RegModImmFrm= 31 << FormShift,
-    N2RegFrm      = 32 << FormShift,
-    NVCVTFrm      = 33 << FormShift,
-    NVDupLnFrm    = 34 << FormShift,
-    N2RegVShLFrm  = 35 << FormShift,
-    N2RegVShRFrm  = 36 << FormShift,
-    N3RegFrm      = 37 << FormShift,
-    N3RegVShFrm   = 38 << FormShift,
-    NVExtFrm      = 39 << FormShift,
-    NVMulSLFrm    = 40 << FormShift,
-    NVTBLFrm      = 41 << FormShift,
-
-    //===------------------------------------------------------------------===//
-    // Misc flags.
-
-    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
-    // it doesn't have a Rn operand.
-    UnaryDP       = 1 << 13,
-
-    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
-    // a 16-bit Thumb instruction if certain conditions are met.
-    Xform16Bit    = 1 << 14,
-
-    //===------------------------------------------------------------------===//
-    // Code domain.
-    DomainShift   = 15,
-    DomainMask    = 7 << DomainShift,
-    DomainGeneral = 0 << DomainShift,
-    DomainVFP     = 1 << DomainShift,
-    DomainNEON    = 2 << DomainShift,
-    DomainNEONA8  = 4 << DomainShift,
-
-    //===------------------------------------------------------------------===//
-    // Field shifts - such shifts are used to set field while generating
-    // machine instructions.
-    //
-    // FIXME: This list will need adjusting/fixing as the MC code emitter
-    // takes shape and the ARMCodeEmitter.cpp bits go away.
-    ShiftTypeShift = 4,
-
-    M_BitShift     = 5,
-    ShiftImmShift  = 5,
-    ShiftShift     = 7,
-    N_BitShift     = 7,
-    ImmHiShift     = 8,
-    SoRotImmShift  = 8,
-    RegRsShift     = 8,
-    ExtRotImmShift = 10,
-    RegRdLoShift   = 12,
-    RegRdShift     = 12,
-    RegRdHiShift   = 16,
-    RegRnShift     = 16,
-    S_BitShift     = 20,
-    W_BitShift     = 21,
-    AM3_I_BitShift = 22,
-    D_BitShift     = 22,
-    U_BitShift     = 23,
-    P_BitShift     = 24,
-    I_BitShift     = 25,
-    CondShift      = 28
-  };
-}
-
 class ARMBaseInstrInfo : public ARMGenInstrInfo {
   const ARMSubtarget &Subtarget;
 
@@ -241,6 +101,10 @@ public:
                                        int &FrameIndex) const;
   virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
                                       int &FrameIndex) const;
+  virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                             int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                            int &FrameIndex) const;
 
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
@@ -259,6 +123,8 @@ public:
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
 
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
   virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
                                                  int FrameIx,
                                                  uint64_t Offset,
@@ -346,6 +212,12 @@ public:
   int getOperandLatency(const InstrItineraryData *ItinData,
                         SDNode *DefNode, unsigned DefIdx,
                         SDNode *UseNode, unsigned UseIdx) const;
+
+  /// VFP/NEON execution domains.
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const;
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
 private:
   int getVLDMDefCycle(const InstrItineraryData *ItinData,
                       const MCInstrDesc &DefMCID,
@@ -382,6 +254,9 @@ private:
   bool hasLowDefLatency(const InstrItineraryData *ItinData,
                         const MachineInstr *DefMI, unsigned DefIdx) const;
 
+  /// verifyInstruction - Perform target specific instruction verification.
+  bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const;
+
 private:
   /// Modeling special VFP / NEON fp MLA / MLS hazards.
 
@@ -464,6 +339,12 @@ ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
 
 int getMatchingCondBranchOpcode(int Opc);
 
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
 /// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
 /// instructions to materializea destreg = basereg + immediate in ARM / Thumb2
 /// code.
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index ba422952ac1a4..7c42342229a26 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -12,13 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMFrameLowering.h"
 #include "ARMInstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -27,7 +27,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Support/Debug.h"
@@ -57,7 +56,7 @@ EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true),
 
 ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
                                          const ARMSubtarget &sti)
-  : ARMGenRegisterInfo(), TII(tii), STI(sti),
+  : ARMGenRegisterInfo(ARM::LR), TII(tii), STI(sti),
     FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
     BasePtr(ARM::R6) {
 }
@@ -354,7 +353,7 @@ const TargetRegisterClass*
 ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
                                                                          const {
   const TargetRegisterClass *Super = RC;
-  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
   do {
     switch (Super->getID()) {
     case ARM::GPRRegClassID:
@@ -375,6 +374,13 @@ ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
   return ARM::GPRRegisterClass;
 }
 
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &ARM::CCRRegClass)
+    return 0;  // Can't copy CCR registers.
+  return RC;
+}
+
 unsigned
 ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                          MachineFunction &MF) const {
@@ -487,19 +493,19 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
 
     if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPREven1);
+        return makeArrayRef(GPREven1);
       else
-        return ArrayRef<unsigned>(GPREven4);
+        return makeArrayRef(GPREven4);
     } else if (FramePtr == ARM::R7) {
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPREven2);
+        return makeArrayRef(GPREven2);
       else
-        return ArrayRef<unsigned>(GPREven5);
+        return makeArrayRef(GPREven5);
     } else { // FramePtr == ARM::R11
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPREven3);
+        return makeArrayRef(GPREven3);
       else
-        return ArrayRef<unsigned>(GPREven6);
+        return makeArrayRef(GPREven6);
     }
   } else if (HintType == ARMRI::RegPairOdd) {
     if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
@@ -509,19 +515,19 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
 
     if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPROdd1);
+        return makeArrayRef(GPROdd1);
       else
-        return ArrayRef<unsigned>(GPROdd4);
+        return makeArrayRef(GPROdd4);
     } else if (FramePtr == ARM::R7) {
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPROdd2);
+        return makeArrayRef(GPROdd2);
       else
-        return ArrayRef<unsigned>(GPROdd5);
+        return makeArrayRef(GPROdd5);
     } else { // FramePtr == ARM::R11
       if (!STI.isR9Reserved())
-        return ArrayRef<unsigned>(GPROdd3);
+        return makeArrayRef(GPROdd3);
       else
-        return ArrayRef<unsigned>(GPROdd6);
+        return makeArrayRef(GPROdd6);
     }
   }
   return RC->getRawAllocationOrder(MF);
@@ -649,10 +655,6 @@ cannotEliminateFrame(const MachineFunction &MF) const {
     || needsStackRealignment(MF);
 }
 
-unsigned ARMBaseRegisterInfo::getRARegister() const {
-  return ARM::LR;
-}
-
 unsigned
 ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -672,99 +674,54 @@ unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
   return 0;
 }
 
-int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int ARMBaseRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  return ARMGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
-
 unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
                                               const MachineFunction &MF) const {
   switch (Reg) {
   default: break;
   // Return 0 if either register of the pair is a special register.
   // So no R12, etc.
-  case ARM::R1:
-    return ARM::R0;
-  case ARM::R3:
-    return ARM::R2;
-  case ARM::R5:
-    return ARM::R4;
+  case ARM::R1: return ARM::R0;
+  case ARM::R3: return ARM::R2;
+  case ARM::R5: return ARM::R4;
   case ARM::R7:
     return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
       ? 0 : ARM::R6;
-  case ARM::R9:
-    return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R8;
-  case ARM::R11:
-    return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
-
-  case ARM::S1:
-    return ARM::S0;
-  case ARM::S3:
-    return ARM::S2;
-  case ARM::S5:
-    return ARM::S4;
-  case ARM::S7:
-    return ARM::S6;
-  case ARM::S9:
-    return ARM::S8;
-  case ARM::S11:
-    return ARM::S10;
-  case ARM::S13:
-    return ARM::S12;
-  case ARM::S15:
-    return ARM::S14;
-  case ARM::S17:
-    return ARM::S16;
-  case ARM::S19:
-    return ARM::S18;
-  case ARM::S21:
-    return ARM::S20;
-  case ARM::S23:
-    return ARM::S22;
-  case ARM::S25:
-    return ARM::S24;
-  case ARM::S27:
-    return ARM::S26;
-  case ARM::S29:
-    return ARM::S28;
-  case ARM::S31:
-    return ARM::S30;
-
-  case ARM::D1:
-    return ARM::D0;
-  case ARM::D3:
-    return ARM::D2;
-  case ARM::D5:
-    return ARM::D4;
-  case ARM::D7:
-    return ARM::D6;
-  case ARM::D9:
-    return ARM::D8;
-  case ARM::D11:
-    return ARM::D10;
-  case ARM::D13:
-    return ARM::D12;
-  case ARM::D15:
-    return ARM::D14;
-  case ARM::D17:
-    return ARM::D16;
-  case ARM::D19:
-    return ARM::D18;
-  case ARM::D21:
-    return ARM::D20;
-  case ARM::D23:
-    return ARM::D22;
-  case ARM::D25:
-    return ARM::D24;
-  case ARM::D27:
-    return ARM::D26;
-  case ARM::D29:
-    return ARM::D28;
-  case ARM::D31:
-    return ARM::D30;
+  case ARM::R9: return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R8;
+  case ARM::R11: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+
+  case ARM::S1: return ARM::S0;
+  case ARM::S3: return ARM::S2;
+  case ARM::S5: return ARM::S4;
+  case ARM::S7: return ARM::S6;
+  case ARM::S9: return ARM::S8;
+  case ARM::S11: return ARM::S10;
+  case ARM::S13: return ARM::S12;
+  case ARM::S15: return ARM::S14;
+  case ARM::S17: return ARM::S16;
+  case ARM::S19: return ARM::S18;
+  case ARM::S21: return ARM::S20;
+  case ARM::S23: return ARM::S22;
+  case ARM::S25: return ARM::S24;
+  case ARM::S27: return ARM::S26;
+  case ARM::S29: return ARM::S28;
+  case ARM::S31: return ARM::S30;
+
+  case ARM::D1: return ARM::D0;
+  case ARM::D3: return ARM::D2;
+  case ARM::D5: return ARM::D4;
+  case ARM::D7: return ARM::D6;
+  case ARM::D9: return ARM::D8;
+  case ARM::D11: return ARM::D10;
+  case ARM::D13: return ARM::D12;
+  case ARM::D15: return ARM::D14;
+  case ARM::D17: return ARM::D16;
+  case ARM::D19: return ARM::D18;
+  case ARM::D21: return ARM::D20;
+  case ARM::D23: return ARM::D22;
+  case ARM::D25: return ARM::D24;
+  case ARM::D27: return ARM::D26;
+  case ARM::D29: return ARM::D28;
+  case ARM::D31: return ARM::D30;
   }
 
   return 0;
@@ -776,85 +733,48 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
   default: break;
   // Return 0 if either register of the pair is a special register.
   // So no R12, etc.
-  case ARM::R0:
-    return ARM::R1;
-  case ARM::R2:
-    return ARM::R3;
-  case ARM::R4:
-    return ARM::R5;
+  case ARM::R0: return ARM::R1;
+  case ARM::R2: return ARM::R3;
+  case ARM::R4: return ARM::R5;
   case ARM::R6:
     return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
       ? 0 : ARM::R7;
-  case ARM::R8:
-    return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R9;
-  case ARM::R10:
-    return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
-
-  case ARM::S0:
-    return ARM::S1;
-  case ARM::S2:
-    return ARM::S3;
-  case ARM::S4:
-    return ARM::S5;
-  case ARM::S6:
-    return ARM::S7;
-  case ARM::S8:
-    return ARM::S9;
-  case ARM::S10:
-    return ARM::S11;
-  case ARM::S12:
-    return ARM::S13;
-  case ARM::S14:
-    return ARM::S15;
-  case ARM::S16:
-    return ARM::S17;
-  case ARM::S18:
-    return ARM::S19;
-  case ARM::S20:
-    return ARM::S21;
-  case ARM::S22:
-    return ARM::S23;
-  case ARM::S24:
-    return ARM::S25;
-  case ARM::S26:
-    return ARM::S27;
-  case ARM::S28:
-    return ARM::S29;
-  case ARM::S30:
-    return ARM::S31;
-
-  case ARM::D0:
-    return ARM::D1;
-  case ARM::D2:
-    return ARM::D3;
-  case ARM::D4:
-    return ARM::D5;
-  case ARM::D6:
-    return ARM::D7;
-  case ARM::D8:
-    return ARM::D9;
-  case ARM::D10:
-    return ARM::D11;
-  case ARM::D12:
-    return ARM::D13;
-  case ARM::D14:
-    return ARM::D15;
-  case ARM::D16:
-    return ARM::D17;
-  case ARM::D18:
-    return ARM::D19;
-  case ARM::D20:
-    return ARM::D21;
-  case ARM::D22:
-    return ARM::D23;
-  case ARM::D24:
-    return ARM::D25;
-  case ARM::D26:
-    return ARM::D27;
-  case ARM::D28:
-    return ARM::D29;
-  case ARM::D30:
-    return ARM::D31;
+  case ARM::R8: return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R9;
+  case ARM::R10: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+
+  case ARM::S0: return ARM::S1;
+  case ARM::S2: return ARM::S3;
+  case ARM::S4: return ARM::S5;
+  case ARM::S6: return ARM::S7;
+  case ARM::S8: return ARM::S9;
+  case ARM::S10: return ARM::S11;
+  case ARM::S12: return ARM::S13;
+  case ARM::S14: return ARM::S15;
+  case ARM::S16: return ARM::S17;
+  case ARM::S18: return ARM::S19;
+  case ARM::S20: return ARM::S21;
+  case ARM::S22: return ARM::S23;
+  case ARM::S24: return ARM::S25;
+  case ARM::S26: return ARM::S27;
+  case ARM::S28: return ARM::S29;
+  case ARM::S30: return ARM::S31;
+
+  case ARM::D0: return ARM::D1;
+  case ARM::D2: return ARM::D3;
+  case ARM::D4: return ARM::D5;
+  case ARM::D6: return ARM::D7;
+  case ARM::D8: return ARM::D9;
+  case ARM::D10: return ARM::D11;
+  case ARM::D12: return ARM::D13;
+  case ARM::D14: return ARM::D15;
+  case ARM::D16: return ARM::D17;
+  case ARM::D18: return ARM::D19;
+  case ARM::D20: return ARM::D21;
+  case ARM::D22: return ARM::D23;
+  case ARM::D24: return ARM::D25;
+  case ARM::D26: return ARM::D27;
+  case ARM::D28: return ARM::D29;
+  case ARM::D30: return ARM::D31;
   }
 
   return 0;
@@ -1111,11 +1031,11 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
 
-  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
-    .addFrameIndex(FrameIdx).addImm(Offset);
+  MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset));
 
   if (!AFI->isThumb1OnlyFunction())
-    AddDefaultCC(AddDefaultPred(MIB));
+    AddDefaultCC(MIB);
 }
 
 void
@@ -1143,6 +1063,7 @@ ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
     Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
   }
   assert (Done && "Unable to resolve frame index!");
+  (void)Done;
 }
 
 bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index b4b4059e73611..fee17ff3c1ca1 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -33,19 +33,6 @@ namespace ARMRI {
   };
 }
 
-/// isARMLowRegister - Returns true if the register is low register r0-r7.
-///
-static inline bool isARMLowRegister(unsigned Reg) {
-  using namespace ARM;
-  switch (Reg) {
-  case R0:  case R1:  case R2:  case R3:
-  case R4:  case R5:  case R6:  case R7:
-    return true;
-  default:
-    return false;
-  }
-}
-
 /// isARMArea1Register - Returns true if the register is a low register (r0-r7)
 /// or a stack/pc register that we should push/pop.
 static inline bool isARMArea1Register(unsigned Reg, bool isDarwin) {
@@ -129,6 +116,8 @@ public:
                                        unsigned &NewSubIdx) const;
 
   const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+  const TargetRegisterClass*
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
 
   const TargetRegisterClass*
   getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
@@ -164,7 +153,6 @@ public:
   bool cannotEliminateFrame(const MachineFunction &MF) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getBaseRegister() const { return BasePtr; }
 
@@ -172,9 +160,6 @@ public:
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
 
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
   bool isLowRegister(unsigned Reg) const;
 
 
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index d6fca62775010..4148d4ab10e97 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -14,12 +14,12 @@
 
 #define DEBUG_TYPE "jit"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMInstrInfo.h"
 #include "ARMRelocations.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -161,11 +161,11 @@ namespace {
     //  are already handled elsewhere. They are placeholders to allow this
     //  encoder to continue to function until the MC encoder is sufficiently
     //  far along that this one can be eliminated entirely.
-    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val) 
+    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
       const { return 0; }
-    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val) 
+    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
       const { return 0; }
-    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val) 
+    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
       const { return 0; }
     unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
       const { return 0; }
@@ -189,13 +189,17 @@ namespace {
       unsigned Op) const { return 0; }
     unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
+    unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
     unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
-    unsigned getSORegOpValue(const MachineInstr &MI, unsigned Op)
+    unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
@@ -203,8 +207,12 @@ namespace {
       const { return 0; }
     unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
+    unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
     unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
+    unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op)
+      const { return 0; }
     unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op)
@@ -213,10 +221,6 @@ namespace {
       const { return 0; }
     unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
-    unsigned getRotImmOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getImmMinusOneOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
     unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
@@ -230,8 +234,6 @@ namespace {
       const { return 0; }
     unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
                                             unsigned Op) const { return 0; }
-    unsigned getMsbOpValue(const MachineInstr &MI,
-                           unsigned Op) const { return 0; }
     unsigned getSsatBitPosValue(const MachineInstr &MI,
                                 unsigned Op) const { return 0; }
     uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
@@ -268,6 +270,8 @@ namespace {
       const { return 0;}
     uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
       const { return 0;}
+    uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
     uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
       const { return 0;}
     uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
@@ -632,15 +636,16 @@ void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
           << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
 
     assert(ACPV->isGlobalValue() && "unsupported constant pool value");
-    const GlobalValue *GV = ACPV->getGV();
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
     if (GV) {
       Reloc::Model RelocM = TM.getRelocationModel();
       emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
                         isa<Function>(GV),
                         Subtarget->GVIsIndirectSymbol(GV, RelocM),
                         (intptr_t)ACPV);
-     } else  {
-      emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
+    } else  {
+      const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+      emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute);
     }
     emitWordLE(0);
   } else {
@@ -983,7 +988,7 @@ unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
 
 unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
                                          const MCInstrDesc &MCID) const {
-  for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e; --i){
+  for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){
     const MachineOperand &MO = MI.getOperand(i-1);
     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
       return 1 << ARMII::S_BitShift;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index f45ebdc535002..3e3a4134c7044 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -15,10 +15,10 @@
 
 #define DEBUG_TYPE "arm-cp-islands"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMInstrInfo.h"
 #include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -739,7 +739,11 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
   // There doesn't seem to be meaningful DebugInfo available; this doesn't
   // correspond to anything in the source.
   unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
-  BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  if (!isThumb)
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  else
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+            .addImm(ARMCC::AL).addReg(0);
   ++NumSplit;
 
   // Update the CFG.  All succs of OrigBB are now succs of NewBB.
@@ -1151,7 +1155,11 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
     // targets will be exchanged, and the altered branch may be out of
     // range, so the machinery has to know about it.
     int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
-    BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+    if (!isThumb)
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+    else
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+              .addImm(ARMCC::AL).addReg(0);
     unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
     ImmBranches.push_back(ImmBranch(&UserMBB->back(),
                           MaxDisp, false, UncondBr));
@@ -1512,7 +1520,11 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
     .addMBB(NextBB).addImm(CC).addReg(CCReg);
   Br.MI = &MBB->back();
   BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
-  BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  if (isThumb)
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+            .addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
   BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
   unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
   ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
@@ -1891,7 +1903,8 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB)
   // There doesn't seem to be meaningful DebugInfo available; this doesn't
   // correspond directly to anything in the source.
   assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
-  BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB);
+  BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB)
+          .addImm(ARMCC::AL).addReg(0);
 
   // Update internal data structures to account for the newly inserted MBB.
   MF.RenumberBlocks(NewBB);
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 165a1d849ad52..aadfd4779db36 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -17,79 +17,57 @@
 #include "llvm/Constants.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Type.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
 using namespace llvm;
 
-ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id,
-                                           ARMCP::ARMCPKind K,
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+                                           ARMCP::ARMCPKind kind,
                                            unsigned char PCAdj,
-                                           ARMCP::ARMCPModifier Modif,
-                                           bool AddCA)
-  : MachineConstantPoolValue((const Type*)cval->getType()),
-    CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
-    Modifier(Modif), AddCurrentAddress(AddCA) {}
-
-ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
-                                           const char *s, unsigned id,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind),
+    PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+                                           ARMCP::ARMCPKind kind,
                                            unsigned char PCAdj,
-                                           ARMCP::ARMCPModifier Modif,
-                                           bool AddCA)
-  : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
-    CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol),
-    PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {}
-
-ARMConstantPoolValue::ARMConstantPoolValue(const GlobalValue *gv,
-                                           ARMCP::ARMCPModifier Modif)
-  : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
-    CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
-    Modifier(Modif), AddCurrentAddress(false) {}
-
-const GlobalValue *ARMConstantPoolValue::getGV() const {
-  return dyn_cast_or_null<GlobalValue>(CVal);
-}
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+    LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
 
-const BlockAddress *ARMConstantPoolValue::getBlockAddress() const {
-  return dyn_cast_or_null<BlockAddress>(CVal);
-}
+ARMConstantPoolValue::~ARMConstantPoolValue() {}
 
-static bool CPV_streq(const char *S1, const char *S2) {
-  if (S1 == S2)
-    return true;
-  if (S1 && S2 && strcmp(S1, S2) == 0)
-    return true;
-  return false;
+const char *ARMConstantPoolValue::getModifierText() const {
+  switch (Modifier) {
+  default: llvm_unreachable("Unknown modifier!");
+    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+    // strings if that's legal.
+  case ARMCP::no_modifier: return "none";
+  case ARMCP::TLSGD:       return "tlsgd";
+  case ARMCP::GOT:         return "GOT";
+  case ARMCP::GOTOFF:      return "GOTOFF";
+  case ARMCP::GOTTPOFF:    return "gottpoff";
+  case ARMCP::TPOFF:       return "tpoff";
+  }
 }
 
 int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
                                                     unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      if (CPV->CVal == CVal &&
-          CPV->LabelId == LabelId &&
-          CPV->PCAdjust == PCAdjust &&
-          CPV_streq(CPV->S, S) &&
-          CPV->Modifier == Modifier)
-        return i;
-    }
-  }
-
+  assert(false && "Shouldn't be calling this directly!");
   return -1;
 }
 
-ARMConstantPoolValue::~ARMConstantPoolValue() {
-  free((void*)S);
-}
-
 void
-ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
-  ID.AddPointer(CVal);
-  ID.AddPointer(S);
+ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
   ID.AddInteger(LabelId);
   ID.AddInteger(PCAdjust);
 }
@@ -97,9 +75,7 @@ ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
 bool
 ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
   if (ACPV->Kind == Kind &&
-      ACPV->CVal == CVal &&
       ACPV->PCAdjust == PCAdjust &&
-      CPV_streq(ACPV->S, S) &&
       ACPV->Modifier == Modifier) {
     if (ACPV->LabelId == LabelId)
       return true;
@@ -115,12 +91,7 @@ void ARMConstantPoolValue::dump() const {
   errs() << "  " << *this;
 }
 
-
 void ARMConstantPoolValue::print(raw_ostream &O) const {
-  if (CVal)
-    O << CVal->getName();
-  else
-    O << S;
   if (Modifier) O << "(" << getModifierText() << ")";
   if (PCAdjust != 0) {
     O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
@@ -128,3 +99,221 @@ void ARMConstantPoolValue::print(raw_ostream &O) const {
     O << ")";
   }
 }
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty,
+                                                 const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+                         AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+  return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+                                ARMCP::ARMCPModifier Modifier) {
+  return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()),
+                                     GV, 0, ARMCP::CPValue, 0,
+                                     Modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                                ARMCP::ARMCPModifier Modifier,
+                                bool AddCurrentAddress) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier,
+                                     AddCurrentAddress);
+}
+
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+  return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+  return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                       unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
+      if (!APC) continue;
+      if (APC->CVal == CVal && equals(APC))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV);
+  return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(CVal);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+  O << CVal->getName();
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s,
+                                             unsigned id,
+                                             unsigned char PCAdj,
+                                             ARMCP::ARMCPModifier Modifier,
+                                             bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+                         AddCurrentAddress),
+    S(strdup(s)) {}
+
+ARMConstantPoolSymbol::~ARMConstantPoolSymbol() {
+  free((void*)S);
+}
+
+ARMConstantPoolSymbol *
+ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
+                              unsigned ID, unsigned char PCAdj) {
+  return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+static bool CPV_streq(const char *S1, const char *S2) {
+  if (S1 == S2)
+    return true;
+  if (S1 && S2 && strcmp(S1, S2) == 0)
+    return true;
+  return false;
+}
+
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                     unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
+      if (!APS) continue;
+
+      if (CPV_streq(APS->S, S) && equals(APS))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+  return ACPS && CPV_streq(ACPS->S, S) &&
+    ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(S);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+  O << S;
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C,
+                                       const MachineBasicBlock *mbb,
+                                       unsigned id, unsigned char PCAdj,
+                                       ARMCP::ARMCPModifier Modifier,
+                                       bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+                         Modifier, AddCurrentAddress),
+    MBB(mbb) {}
+
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+                                               const MachineBasicBlock *mbb,
+                                               unsigned ID,
+                                               unsigned char PCAdj) {
+  return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                  unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
+      if (!APMBB) continue;
+
+      if (APMBB->MBB == MBB && equals(APMBB))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV);
+  return ACPMBB && ACPMBB->MBB == MBB &&
+    ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(MBB);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+  ARMConstantPoolValue::print(O);
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index d008811c40e4e..0d0def32b7d8a 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -20,17 +20,19 @@
 
 namespace llvm {
 
-class Constant;
 class BlockAddress;
+class Constant;
 class GlobalValue;
 class LLVMContext;
+class MachineBasicBlock;
 
 namespace ARMCP {
   enum ARMCPKind {
     CPValue,
     CPExtSymbol,
     CPBlockAddress,
-    CPLSDA
+    CPLSDA,
+    CPMachineBasicBlock
   };
 
   enum ARMCPModifier {
@@ -47,8 +49,6 @@ namespace ARMCP {
 /// represent PC-relative displacement between the address of the load
 /// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
 class ARMConstantPoolValue : public MachineConstantPoolValue {
-  const Constant *CVal;    // Constant being loaded.
-  const char *S;           // ExtSymbol being loaded.
   unsigned LabelId;        // Label id of the load.
   ARMCP::ARMCPKind Kind;   // Kind of constant.
   unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
@@ -56,60 +56,54 @@ class ARMConstantPoolValue : public MachineConstantPoolValue {
   ARMCP::ARMCPModifier Modifier;   // GV modifier i.e. (&GV(modifier)-(LPIC+8))
   bool AddCurrentAddress;
 
+protected:
+  ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
 public:
-  ARMConstantPoolValue(const Constant *cval, unsigned id,
-                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
-                       unsigned char PCAdj = 0,
-                       ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
-                       bool AddCurrentAddress = false);
-  ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
-                       unsigned char PCAdj = 0,
-                       ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
-                       bool AddCurrentAddress = false);
-  ARMConstantPoolValue(const GlobalValue *GV, ARMCP::ARMCPModifier Modifier);
-  ARMConstantPoolValue();
-  ~ARMConstantPoolValue();
+  virtual ~ARMConstantPoolValue();
 
-  const GlobalValue *getGV() const;
-  const char *getSymbol() const { return S; }
-  const BlockAddress *getBlockAddress() const;
   ARMCP::ARMCPModifier getModifier() const { return Modifier; }
-  const char *getModifierText() const {
-    switch (Modifier) {
-    default: llvm_unreachable("Unknown modifier!");
-    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
-    // strings if that's legal.
-    case ARMCP::no_modifier: return "none";
-    case ARMCP::TLSGD:       return "tlsgd";
-    case ARMCP::GOT:         return "GOT";
-    case ARMCP::GOTOFF:      return "GOTOFF";
-    case ARMCP::GOTTPOFF:    return "gottpoff";
-    case ARMCP::TPOFF:       return "tpoff";
-    }
-  }
+  const char *getModifierText() const;
   bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
+
   bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
   unsigned getLabelId() const { return LabelId; }
   unsigned char getPCAdjustment() const { return PCAdjust; }
+
   bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
   bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
-  bool isBlockAddress() { return Kind == ARMCP::CPBlockAddress; }
-  bool isLSDA() { return Kind == ARMCP::CPLSDA; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+  bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
 
   virtual unsigned getRelocationInfo() const { return 2; }
 
   virtual int getExistingMachineCPValue(MachineConstantPool *CP,
                                         unsigned Alignment);
 
-  virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
 
-  /// hasSameValue - Return true if this ARM constpool value
-  /// can share the same constantpool entry as another ARM constpool value.
-  bool hasSameValue(ARMConstantPoolValue *ACPV);
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  bool equals(const ARMConstantPoolValue *A) const {
+    return this->LabelId == A->LabelId &&
+      this->PCAdjust == A->PCAdjust &&
+      this->Modifier == A->Modifier;
+  }
 
+  virtual void print(raw_ostream &O) const;
   void print(raw_ostream *O) const { if (O) print(*O); }
-  void print(raw_ostream &O) const;
   void dump() const;
+
+  static bool classof(const ARMConstantPoolValue *) { return true; }
 };
 
 inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
@@ -117,6 +111,123 @@ inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
   return O;
 }
 
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+  const Constant *CVal;         // Constant being loaded.
+
+  ARMConstantPoolConstant(const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(Type *Ty, const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+  static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+                                         ARMCP::ARMCPModifier Modifier);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj,
+                                         ARMCP::ARMCPModifier Modifier,
+                                         bool AddCurrentAddress);
+
+  const GlobalValue *getGV() const;
+  const BlockAddress *getBlockAddress() const;
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  virtual void print(raw_ostream &O) const;
+  static bool classof(const ARMConstantPoolValue *APV) {
+    return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
+  }
+  static bool classof(const ARMConstantPoolConstant *) { return true; }
+};
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+  const char *S;                // ExtSymbol being loaded.
+
+  ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id,
+                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                        bool AddCurrentAddress);
+
+public:
+  ~ARMConstantPoolSymbol();
+
+  static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s,
+                                       unsigned ID, unsigned char PCAdj);
+
+  const char *getSymbol() const { return S; }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void print(raw_ostream &O) const;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+  static bool classof(const ARMConstantPoolSymbol *) { return true; }
+};
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+                     unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                     bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolMBB *Create(LLVMContext &C,
+                                    const MachineBasicBlock *mbb,
+                                    unsigned ID, unsigned char PCAdj);
+
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void print(raw_ostream &O) const;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+  static bool classof(const ARMConstantPoolMBB *) { return true; }
+};
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 94b72fdb9a7e2..7872cb90f4e7a 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -16,19 +16,24 @@
 
 #define DEBUG_TYPE "arm-pseudo"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
 using namespace llvm;
 
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+                cl::desc("Verify machine code after expanding ARM pseudos"));
+
 namespace {
   class ARMExpandPseudo : public MachineFunctionPass {
   public:
@@ -741,8 +746,22 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       MI.eraseFromParent();
       return true;
     }
-    case ARM::MOVCCs: {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+    case ARM::MOVCCsi: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+              (MI.getOperand(1).getReg()))
+        .addReg(MI.getOperand(2).getReg(),
+                getKillRegState(MI.getOperand(2).isKill()))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addReg(MI.getOperand(5).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVCCsr: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
               (MI.getOperand(1).getReg()))
         .addReg(MI.getOperand(2).getReg(),
                 getKillRegState(MI.getOperand(2).isKill()))
@@ -837,10 +856,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::MOVsrl_flag:
     case ARM::MOVsra_flag: {
       // These are just fancy MOVs insructions.
-      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
                              MI.getOperand(0).getReg())
                      .addOperand(MI.getOperand(1))
-                     .addReg(0)
                      .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
                                                   ARM_AM::lsr : ARM_AM::asr),
                                                  1)))
@@ -851,10 +869,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::RRX: {
       // This encodes as "MOVs Rd, Rm, rrx
       MachineInstrBuilder MIB =
-        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
                                MI.getOperand(0).getReg())
                        .addOperand(MI.getOperand(1))
-                       .addOperand(MI.getOperand(1))
                        .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
         .addReg(0);
       TransferImpOps(MI, MIB, MIB);
@@ -953,34 +970,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       ExpandMOV32BitImm(MBB, MBBI);
       return true;
 
-    case ARM::VMOVQQ: {
-      unsigned DstReg = MI.getOperand(0).getReg();
-      bool DstIsDead = MI.getOperand(0).isDead();
-      unsigned EvenDst = TRI->getSubReg(DstReg, ARM::qsub_0);
-      unsigned OddDst  = TRI->getSubReg(DstReg, ARM::qsub_1);
-      unsigned SrcReg = MI.getOperand(1).getReg();
-      bool SrcIsKill = MI.getOperand(1).isKill();
-      unsigned EvenSrc = TRI->getSubReg(SrcReg, ARM::qsub_0);
-      unsigned OddSrc  = TRI->getSubReg(SrcReg, ARM::qsub_1);
-      MachineInstrBuilder Even =
-        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
-                               TII->get(ARM::VORRq))
-                       .addReg(EvenDst,
-                               RegState::Define | getDeadRegState(DstIsDead))
-                       .addReg(EvenSrc, getKillRegState(SrcIsKill))
-                       .addReg(EvenSrc, getKillRegState(SrcIsKill)));
-      MachineInstrBuilder Odd =
-        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
-                               TII->get(ARM::VORRq))
-                       .addReg(OddDst,
-                               RegState::Define | getDeadRegState(DstIsDead))
-                       .addReg(OddSrc, getKillRegState(SrcIsKill))
-                       .addReg(OddSrc, getKillRegState(SrcIsKill)));
-      TransferImpOps(MI, Even, Odd);
-      MI.eraseFromParent();
-      return true;
-    }
-
     case ARM::VLDMQIA: {
       unsigned NewOpc = ARM::VLDMDIA;
       MachineInstrBuilder MIB =
@@ -1316,6 +1305,8 @@ bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
   for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
        ++MFI)
     Modified |= ExpandMBB(*MFI);
+  if (VerifyARMPseudo)
+    MF.verify(this, "After expanding ARM pseudo instructions.");
   return Modified;
 }
 
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index f469d7efe11a2..9bc7ef21d8a34 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -14,13 +14,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMCallingConv.h"
 #include "ARMRegisterInfo.h"
 #include "ARMTargetMachine.h"
 #include "ARMSubtarget.h"
 #include "ARMConstantPoolValue.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CallingConv.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
@@ -171,8 +171,8 @@ class ARMFastISel : public FastISel {
 
     // Utility routines.
   private:
-    bool isTypeLegal(const Type *Ty, MVT &VT);
-    bool isLoadTypeLegal(const Type *Ty, MVT &VT);
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
     bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr);
     bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr);
     bool ARMComputeAddress(const Value *Obj, Address &Addr);
@@ -502,11 +502,19 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) {
   // This checks to see if we can use VFP3 instructions to materialize
   // a constant, otherwise we have to go through the constant pool.
   if (TLI.isFPImmLegal(Val, VT)) {
-    unsigned Opc = is64bit ? ARM::FCONSTD : ARM::FCONSTS;
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = ARM_AM::getFP64Imm(Val);
+      Opc = ARM::FCONSTD;
+    } else {
+      Imm = ARM_AM::getFP32Imm(Val);
+      Opc = ARM::FCONSTS;
+    }
     unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
                             DestReg)
-                    .addFPImm(CFP));
+                    .addImm(Imm));
     return DestReg;
   }
 
@@ -590,8 +598,9 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
   // Grab index.
   unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8);
   unsigned Id = AFI->createPICLabelUId();
-  ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, Id,
-                                                       ARMCP::CPValue, PCAdj);
+  ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+                                                              ARMCP::CPValue,
+                                                              PCAdj);
   unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
 
   // Load value.
@@ -615,8 +624,8 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
   if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
     unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
     if (isThumb)
-      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRi12),
-                    NewDestReg)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                    TII.get(ARM::t2LDRi12), NewDestReg)
             .addReg(DestReg)
             .addImm(0);
     else
@@ -673,7 +682,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
   return 0;
 }
 
-bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) {
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
   EVT evt = TLI.getValueType(Ty, true);
 
   // Only handle simple types.
@@ -685,7 +694,7 @@ bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) {
   return TLI.isTypeLegal(VT);
 }
 
-bool ARMFastISel::isLoadTypeLegal(const Type *Ty, MVT &VT) {
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
   if (isTypeLegal(Ty, VT)) return true;
 
   // If this is a type than can be sign or zero-extended to a basic operation
@@ -714,7 +723,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
     U = C;
   }
 
-  if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+  if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
     if (Ty->getAddressSpace() > 255)
       // Fast instruction selection doesn't support the special
       // address spaces.
@@ -749,7 +758,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
       for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
            i != e; ++i, ++GTI) {
         const Value *Op = *i;
-        if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
           const StructLayout *SL = TD.getStructLayout(STy);
           unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
           TmpOffset += SL->getElementOffset(Idx);
@@ -946,6 +955,10 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) {
 }
 
 bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
   // Verify we have a legal type before going any further.
   MVT VT;
   if (!isLoadTypeLegal(I->getType(), VT))
@@ -1008,6 +1021,10 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
   Value *Op0 = I->getOperand(0);
   unsigned SrcReg = 0;
 
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
   // Verify we have a legal type before going any further.
   MVT VT;
   if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
@@ -1085,7 +1102,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
   // TODO: Factor this out.
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
     MVT SourceVT;
-    const Type *Ty = CI->getOperand(0)->getType();
+    Type *Ty = CI->getOperand(0)->getType();
     if (CI->hasOneUse() && (CI->getParent() == I->getParent())
         && isTypeLegal(Ty, SourceVT)) {
       bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
@@ -1201,7 +1218,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
   const CmpInst *CI = cast<CmpInst>(I);
 
   MVT VT;
-  const Type *Ty = CI->getOperand(0)->getType();
+  Type *Ty = CI->getOperand(0)->getType();
   if (!isTypeLegal(Ty, VT))
     return false;
 
@@ -1309,7 +1326,7 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
   if (!Subtarget->hasVFP2()) return false;
 
   MVT DstVT;
-  const Type *Ty = I->getType();
+  Type *Ty = I->getType();
   if (!isTypeLegal(Ty, DstVT))
     return false;
 
@@ -1328,7 +1345,7 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
   unsigned Opc;
   if (Ty->isFloatTy()) Opc = ARM::VSITOS;
   else if (Ty->isDoubleTy()) Opc = ARM::VSITOD;
-  else return 0;
+  else return false;
 
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
@@ -1343,7 +1360,7 @@ bool ARMFastISel::SelectFPToSI(const Instruction *I) {
   if (!Subtarget->hasVFP2()) return false;
 
   MVT DstVT;
-  const Type *RetTy = I->getType();
+  Type *RetTy = I->getType();
   if (!isTypeLegal(RetTy, DstVT))
     return false;
 
@@ -1351,10 +1368,10 @@ bool ARMFastISel::SelectFPToSI(const Instruction *I) {
   if (Op == 0) return false;
 
   unsigned Opc;
-  const Type *OpTy = I->getOperand(0)->getType();
+  Type *OpTy = I->getOperand(0)->getType();
   if (OpTy->isFloatTy()) Opc = ARM::VTOSIZS;
   else if (OpTy->isDoubleTy()) Opc = ARM::VTOSIZD;
-  else return 0;
+  else return false;
 
   // f64->s32 or f32->s32 both need an intermediate f32 reg.
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
@@ -1401,7 +1418,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
 
 bool ARMFastISel::SelectSDiv(const Instruction *I) {
   MVT VT;
-  const Type *Ty = I->getType();
+  Type *Ty = I->getType();
   if (!isTypeLegal(Ty, VT))
     return false;
 
@@ -1429,7 +1446,7 @@ bool ARMFastISel::SelectSDiv(const Instruction *I) {
 
 bool ARMFastISel::SelectSRem(const Instruction *I) {
   MVT VT;
-  const Type *Ty = I->getType();
+  Type *Ty = I->getType();
   if (!isTypeLegal(Ty, VT))
     return false;
 
@@ -1456,7 +1473,7 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
   // operations, but can't figure out how to. Just use the vfp instructions
   // if we have them.
   // FIXME: It'd be nice to use NEON instructions.
-  const Type *Ty = I->getType();
+  Type *Ty = I->getType();
   bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
   if (isFloat && !Subtarget->hasVFP2())
     return false;
@@ -1711,7 +1728,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
     CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
 
     const Value *RV = Ret->getOperand(0);
@@ -1778,7 +1795,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
 
   // Handle *simple* calls for now.
-  const Type *RetTy = I->getType();
+  Type *RetTy = I->getType();
   MVT RetVT;
   if (RetTy->isVoidTy())
     RetVT = MVT::isVoid;
@@ -1802,7 +1819,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
     unsigned Arg = getRegForValue(Op);
     if (Arg == 0) return false;
 
-    const Type *ArgTy = Op->getType();
+    Type *ArgTy = Op->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT)) return false;
 
@@ -1870,13 +1887,13 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   // TODO: Avoid some calling conventions?
 
   // Let SDISel handle vararg functions.
-  const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
   if (FTy->isVarArg())
     return false;
 
   // Handle *simple* calls for now.
-  const Type *RetTy = I->getType();
+  Type *RetTy = I->getType();
   MVT RetVT;
   if (RetTy->isVoidTy())
     RetVT = MVT::isVoid;
@@ -1915,7 +1932,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
         CS.paramHasAttr(AttrInd, Attribute::ByVal))
       return false;
 
-    const Type *ArgTy = (*i)->getType();
+    Type *ArgTy = (*i)->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT))
       return false;
@@ -1969,9 +1986,9 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
   // On ARM, in general, integer casts don't involve legal types; this code
   // handles promotable integers.  The high bits for a type smaller than
   // the register size are assumed to be undefined.
-  const Type *DestTy = I->getType();
+  Type *DestTy = I->getType();
   Value *Op = I->getOperand(0);
-  const Type *SrcTy = Op->getType();
+  Type *SrcTy = Op->getType();
 
   EVT SrcVT, DestVT;
   SrcVT = TLI.getValueType(SrcTy, true);
@@ -2002,16 +2019,18 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
   switch (SrcVT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i16:
+    if (!Subtarget->hasV6Ops()) return false;
     if (isZext)
-      Opc = isThumb ? ARM::t2UXTHr : ARM::UXTHr;
+      Opc = isThumb ? ARM::t2UXTH : ARM::UXTH;
     else
-      Opc = isThumb ? ARM::t2SXTHr : ARM::SXTHr;
+      Opc = isThumb ? ARM::t2SXTH : ARM::SXTH;
     break;
   case MVT::i8:
+    if (!Subtarget->hasV6Ops()) return false;
     if (isZext)
-      Opc = isThumb ? ARM::t2UXTBr : ARM::UXTBr;
+      Opc = isThumb ? ARM::t2UXTB : ARM::UXTB;
     else
-      Opc = isThumb ? ARM::t2SXTBr : ARM::SXTBr;
+      Opc = isThumb ? ARM::t2SXTB : ARM::SXTB;
     break;
   case MVT::i1:
     if (isZext) {
@@ -2033,6 +2052,8 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
         .addReg(SrcReg);
   if (isBoolZext)
     MIB.addImm(1);
+  else
+    MIB.addImm(0);
   AddOptionalDefs(MIB);
   UpdateValueMap(I, DestReg);
   return true;
diff --git a/lib/Target/ARM/ARMFixupKinds.h b/lib/Target/ARM/ARMFixupKinds.h
deleted file mode 100644
index 350c92decdce4..0000000000000
--- a/lib/Target/ARM/ARMFixupKinds.h
+++ /dev/null
@@ -1,97 +0,0 @@
-//===-- ARM/ARMFixupKinds.h - ARM Specific Fixup Entries --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ARM_ARMFIXUPKINDS_H
-#define LLVM_ARM_ARMFIXUPKINDS_H
-
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-namespace ARM {
-enum Fixups {
-  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
-  // addresses
-  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
-
-  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
-  // the 16-bit halfwords reordered.
-  fixup_t2_ldst_pcrel_12,
-
-  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
-  // used in VFP instructions where the lower 2 bits are not encoded
-  // (so it's encoded as an 8-bit immediate).
-  fixup_arm_pcrel_10,
-  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
-  // the short-swapped encoding of Thumb2 instructions.
-  fixup_t2_pcrel_10,
-  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
-  // addresses where the lower 2 bits are not encoded (so it's encoded as an
-  // 8-bit immediate).
-  fixup_thumb_adr_pcrel_10,
-  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
-  // instruction.
-  fixup_arm_adr_pcrel_12,
-  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
-  // instruction.
-  fixup_t2_adr_pcrel_12,
-  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
-  // instructions. 
-  fixup_arm_condbranch,
-  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
-  // branch instructions. (unconditional)
-  fixup_arm_uncondbranch,
-  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
-  // uconditional branch instructions.
-  fixup_t2_condbranch,
-  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
-  // branch unconditional branch instructions.
-  fixup_t2_uncondbranch,
-
-  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
-  fixup_arm_thumb_br,
-
-  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
-  fixup_arm_thumb_bl,
-
-  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
-  fixup_arm_thumb_blx,
-
-  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
-  fixup_arm_thumb_cb,
-
-  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
-  fixup_arm_thumb_cp,
-
-  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
-  fixup_arm_thumb_bcc,
-
-  // The next two are for the movt/movw pair
-  // the 16bit imm field are split into imm{15-12} and imm{11-0}
-  fixup_arm_movt_hi16, // :upper16:
-  fixup_arm_movw_lo16, // :lower16:
-  fixup_t2_movt_hi16, // :upper16:
-  fixup_t2_movw_lo16, // :lower16:
-
-  // It is possible to create an "immediate" that happens to be pcrel.
-  // movw r0, :lower16:Foo-(Bar+8) and movt  r0, :upper16:Foo-(Bar+8)
-  // result in different reloc tags than the above two.
-  // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC
-  fixup_arm_movt_hi16_pcrel, // :upper16:
-  fixup_arm_movw_lo16_pcrel, // :lower16:
-  fixup_t2_movt_hi16_pcrel, // :upper16:
-  fixup_t2_movw_lo16_pcrel, // :lower16:
-
-  // Marker
-  LastTargetFixupKind,
-  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
-};
-}
-}
-
-#endif
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 381b404519e22..2d1de6fe8e9d1 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMFrameLowering.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -93,7 +93,8 @@ static bool isCSRestore(MachineInstr *MI,
         return false;
     return true;
   }
-  if ((MI->getOpcode() == ARM::LDR_POST ||
+  if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
+       MI->getOpcode() == ARM::LDR_POST_REG ||
        MI->getOpcode() == ARM::t2LDR_POST) &&
       isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
       MI->getOperand(1).getReg() == ARM::SP)
@@ -413,6 +414,9 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
         MIB.addExternalSymbol(JumpTarget.getSymbolName(),
                               JumpTarget.getTargetFlags());
       }
+
+      // Add the default predicate in Thumb mode.
+      if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
     } else if (RetOpcode == ARM::TCRETURNri) {
       BuildMI(MBB, MBBI, dl,
               TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
@@ -502,7 +506,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
         }
       }
     } else if (AFI->isThumb2Function()) {
-      // Use  add <rd>, sp, #<imm8> 
+      // Use  add <rd>, sp, #<imm8>
       //      ldr <rd>, [sp, #<imm8>]
       // if at all possible to save space.
       if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
@@ -587,14 +591,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
       MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
                                         ARM::SP)
         .addReg(Regs[0].first, getKillRegState(Regs[0].second))
-        .addReg(ARM::SP).setMIFlags(MIFlags);
-      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
-      // that refactoring is complete (eventually).
-      if (StrOpc == ARM::STR_PRE) {
-        MIB.addReg(0);
-        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift));
-      } else
-        MIB.addImm(-4);
+        .addReg(ARM::SP).setMIFlags(MIFlags)
+        .addImm(-4);
       AddDefaultPred(MIB);
     }
     Regs.clear();
@@ -651,8 +649,10 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
                        .addReg(ARM::SP));
       for (unsigned i = 0, e = Regs.size(); i < e; ++i)
         MIB.addReg(Regs[i], getDefRegState(true));
-      if (DeleteRet)
+      if (DeleteRet) {
+        MIB->copyImplicitOps(&*MI);
         MI->eraseFromParent();
+      }
       MI = MIB;
     } else if (Regs.size() == 1) {
       // If we adjusted the reg to PC from LR above, switch it back here. We
@@ -665,7 +665,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
           .addReg(ARM::SP);
       // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
       // that refactoring is complete (eventually).
-      if (LdrOpc == ARM::LDR_POST) {
+      if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
         MIB.addReg(0);
         MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
       } else
@@ -687,7 +687,8 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
   unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
-  unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE;
+  unsigned PushOneOpc = AFI->isThumbFunction() ?
+    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
   unsigned FltOpc = ARM::VSTMDDB_UPD;
   emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register,
                MachineInstr::FrameSetup);
@@ -711,7 +712,7 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
 
   unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
-  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST;
+  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
   unsigned FltOpc = ARM::VLDMDIA_UPD;
   emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register);
   emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
diff --git a/lib/Target/ARM/ARMGlobalMerge.cpp b/lib/Target/ARM/ARMGlobalMerge.cpp
index 8d77b2d8383e1..5f863ea241ca8 100644
--- a/lib/Target/ARM/ARMGlobalMerge.cpp
+++ b/lib/Target/ARM/ARMGlobalMerge.cpp
@@ -100,8 +100,8 @@ namespace {
       GlobalCmp(const TargetData *td) : TD(td) { }
 
       bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
-        const Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
-        const Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+        Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+        Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
 
         return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2));
       }
@@ -123,7 +123,7 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
   // FIXME: Find better heuristics
   std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD));
 
-  const Type *Int32Ty = Type::getInt32Ty(M.getContext());
+  Type *Int32Ty = Type::getInt32Ty(M.getContext());
 
   for (size_t i = 0, e = Globals.size(); i != e; ) {
     size_t j = 0;
@@ -150,7 +150,7 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
         ConstantInt::get(Int32Ty, 0),
         ConstantInt::get(Int32Ty, k-i)
       };
-      Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx, 2);
+      Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
       Globals[k]->replaceAllUsesWith(GEP);
       Globals[k]->eraseFromParent();
     }
@@ -176,7 +176,7 @@ bool ARMGlobalMerge::doInitialization(Module &M) {
 
     // Ignore fancy-aligned globals for now.
     unsigned Alignment = I->getAlignment();
-    const Type *Ty = I->getType()->getElementType();
+    Type *Ty = I->getType()->getElementType();
     if (Alignment > TD->getABITypeAlignment(Ty))
       continue;
 
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 2c9481b86c557..5ee009c04c5b4 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -14,8 +14,8 @@
 #define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
-#include "ARMAddressingModes.h"
 #include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -47,6 +47,11 @@ CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
   cl::desc("Check fp vmla / vmls hazard at isel time"),
   cl::init(true));
 
+static cl::opt<bool>
+DisableARMIntABS("disable-arm-int-abs", cl::Hidden,
+  cl::desc("Enable / disable ARM integer abs transform"),
+  cl::init(false));
+
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
 /// instructions for SelectionDAG operations.
@@ -90,13 +95,20 @@ public:
   bool hasNoVMLxHazardUse(SDNode *N) const;
   bool isShifterOpProfitable(const SDValue &Shift,
                              ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
-  bool SelectShifterOperandReg(SDValue N, SDValue &A,
+  bool SelectRegShifterOperand(SDValue N, SDValue &A,
                                SDValue &B, SDValue &C,
                                bool CheckProfitability = true);
-  bool SelectShiftShifterOperandReg(SDValue N, SDValue &A,
+  bool SelectImmShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, bool CheckProfitability = true);
+  bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
                                     SDValue &B, SDValue &C) {
     // Don't apply the profitability check
-    return SelectShifterOperandReg(N, A, B, C, false);
+    return SelectRegShifterOperand(N, A, B, C, false);
+  }
+  bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B) {
+    // Don't apply the profitability check
+    return SelectImmShifterOperand(N, A, B, false);
   }
 
   bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -122,8 +134,13 @@ public:
     return true;
   }
 
-  bool SelectAddrMode2Offset(SDNode *Op, SDValue N,
+  bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
   bool SelectAddrMode3(SDValue N, SDValue &Base,
                        SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
@@ -240,8 +257,13 @@ private:
                                ARMCC::CondCodes CCVal, SDValue CCR,
                                SDValue InFlag);
 
+  // Select special operations if node forms integer ABS pattern
+  SDNode *SelectABSOp(SDNode *N);
+
   SDNode *SelectConcatVector(SDNode *N);
 
+  SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
   virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
@@ -291,10 +313,10 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
 /// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax).
 ///
 /// \param ScaledConstant [out] - On success, the pre-scaled constant value.
-static bool isScaledConstantInRange(SDValue Node, unsigned Scale,
+static bool isScaledConstantInRange(SDValue Node, int Scale,
                                     int RangeMin, int RangeMax,
                                     int &ScaledConstant) {
-  assert(Scale && "Invalid scale!");
+  assert(Scale > 0 && "Invalid scale!");
 
   // Check that this is a constant.
   const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
@@ -365,7 +387,30 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
   return ShOpcVal == ARM_AM::lsl && ShAmt == 2;
 }
 
-bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS) return false;
+  ShImmVal = RHS->getZExtValue() & 31;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
                                               SDValue &BaseReg,
                                               SDValue &ShReg,
                                               SDValue &Opc,
@@ -373,7 +418,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
   if (DisableShifterOp)
     return false;
 
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
 
   // Don't match base register only case. That is matched to a separate
   // lower complexity pattern with explicit register operand.
@@ -381,19 +426,18 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
 
   BaseReg = N.getOperand(0);
   unsigned ShImmVal = 0;
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    ShReg = CurDAG->getRegister(0, MVT::i32);
-    ShImmVal = RHS->getZExtValue() & 31;
-  } else {
-    ShReg = N.getOperand(1);
-    if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
-      return false;
-  }
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (RHS) return false;
+
+  ShReg = N.getOperand(1);
+  if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+    return false;
   Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
                                   MVT::i32);
   return true;
 }
 
+
 bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
                                           SDValue &Base,
                                           SDValue &OffImm) {
@@ -483,13 +527,10 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
       return false;
   }
 
-  if (Subtarget->isCortexA9() && !N.hasOneUse())
-    // Compute R +/- (R << N) and reuse it.
-    return false;
-
   // Otherwise this is R +/- [possibly shifted] R.
   ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
   unsigned ShAmt = 0;
 
   Base   = N.getOperand(0);
@@ -515,16 +556,14 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
   // Try matching (R shl C) + (R).
   if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
       !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
-    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
     if (ShOpcVal != ARM_AM::no_shift) {
       // Check to see if the RHS of the shift is a constant, if not, we can't
       // fold it.
       if (ConstantSDNode *Sh =
           dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
         ShAmt = Sh->getZExtValue();
-        if (!Subtarget->isCortexA9() ||
-            (N.hasOneUse() &&
-             isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
           Offset = N.getOperand(0).getOperand(0);
           Base = N.getOperand(1);
         } else {
@@ -630,7 +669,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
 
   // Otherwise this is R +/- [possibly shifted] R.
   ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
   unsigned ShAmt = 0;
 
   Base   = N.getOperand(0);
@@ -656,16 +696,14 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
   // Try matching (R shl C) + (R).
   if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
       !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
-    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
     if (ShOpcVal != ARM_AM::no_shift) {
       // Check to see if the RHS of the shift is a constant, if not, we can't
       // fold it.
       if (ConstantSDNode *Sh =
           dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
         ShAmt = Sh->getZExtValue();
-        if (!Subtarget->isCortexA9() ||
-            (N.hasOneUse() &&
-             isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
           Offset = N.getOperand(0).getOperand(0);
           Base = N.getOperand(1);
         } else {
@@ -683,7 +721,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
   return AM2_SHOP;
 }
 
-bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
                                             SDValue &Offset, SDValue &Opc) {
   unsigned Opcode = Op->getOpcode();
   ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
@@ -692,16 +730,11 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
   ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
     ? ARM_AM::add : ARM_AM::sub;
   int Val;
-  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
-    Offset = CurDAG->getRegister(0, MVT::i32);
-    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
-                                                      ARM_AM::no_shift),
-                                    MVT::i32);
-    return true;
-  }
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val))
+    return false;
 
   Offset = N;
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
   unsigned ShAmt = 0;
   if (ShOpcVal != ARM_AM::no_shift) {
     // Check to see if the RHS of the shift is a constant, if not, we can't fold
@@ -724,6 +757,50 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    if (AddSub == ARM_AM::sub) Val *= -1;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(Val, MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+  Base = N;
+  return true;
+}
 
 bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
                                       SDValue &Base, SDValue &Offset,
@@ -1079,7 +1156,7 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
   if (DisableShifterOp)
     return false;
 
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
 
   // Don't match base register only case. That is matched to a separate
   // lower complexity pattern with explicit register operand.
@@ -1208,21 +1285,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
       return false;
   }
 
-  if (Subtarget->isCortexA9() && !N.hasOneUse()) {
-    // Compute R + (R << [1,2,3]) and reuse it.
-    Base = N;
-    return false;
-  }
-
   // Look for (R + R) or (R + (R << [1,2,3])).
   unsigned ShAmt = 0;
   Base   = N.getOperand(0);
   OffReg = N.getOperand(1);
 
   // Swap if it is ((R << c) + R).
-  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg);
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
   if (ShOpcVal != ARM_AM::lsl) {
-    ShOpcVal = ARM_AM::getShiftOpcForNode(Base);
+    ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
     if (ShOpcVal == ARM_AM::lsl)
       std::swap(Base, OffReg);
   }
@@ -1266,10 +1337,19 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
   bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
   unsigned Opcode = 0;
   bool Match = false;
-  if (LoadedVT == MVT::i32 &&
-      SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
-    Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST;
+  if (LoadedVT == MVT::i32 && isPre &&
+      SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_PRE_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 && !isPre &&
+      SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_POST_IMM;
     Match = true;
+  } else if (LoadedVT == MVT::i32 &&
+      SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+    Match = true;
+
   } else if (LoadedVT == MVT::i16 &&
              SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
     Match = true;
@@ -1283,20 +1363,37 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
         Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
       }
     } else {
-      if (SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
+      if (isPre &&
+          SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_PRE_IMM;
+      } else if (!isPre &&
+                  SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_POST_IMM;
+      } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
         Match = true;
-        Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST;
+        Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
       }
     }
   }
 
   if (Match) {
-    SDValue Chain = LD->getChain();
-    SDValue Base = LD->getBasePtr();
-    SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
-                     CurDAG->getRegister(0, MVT::i32), Chain };
-    return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
-                                  MVT::Other, Ops, 6);
+    if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops, 5);
+    } else {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops, 6);
+    }
   }
 
   return NULL;
@@ -1966,7 +2063,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
                                 Srl_imm)) {
         assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
 
-        unsigned Width = CountTrailingOnes_32(And_imm);
+        // Note: The width operand is encoded as width-1.
+        unsigned Width = CountTrailingOnes_32(And_imm) - 1;
         unsigned LSB = Srl_imm;
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
         SDValue Ops[] = { N->getOperand(0).getOperand(0),
@@ -1986,7 +2084,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
     unsigned Srl_imm = 0;
     if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
       assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
-      unsigned Width = 32 - Srl_imm;
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = 32 - Srl_imm - 1;
       int LSB = Srl_imm - Shl_imm;
       if (LSB < 0)
         return NULL;
@@ -2034,10 +2133,16 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
   SDValue CPTmp0;
   SDValue CPTmp1;
   SDValue CPTmp2;
-  if (SelectShifterOperandReg(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
+  if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
+  }
+
+  if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
     SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
     SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7);
+    return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
   }
   return 0;
 }
@@ -2198,6 +2303,56 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
   return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
 }
 
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X,  0,  X, -X
+/// select_cc setgt    X, -1,  X, -X
+/// select_cc setl[te] X,  0, -X,  X
+/// select_cc setlt    X,  1, -X,  X
+/// which represent Integer ABS into:
+/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to
+/// ARM::ABS or ARM::t2ABS machine node.
+SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+  SDValue XORSrc0 = N->getOperand(0);
+  SDValue XORSrc1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  if (DisableARMIntABS)
+    return NULL;
+
+  if (Subtarget->isThumb1Only())
+    return NULL;
+
+  if (XORSrc0.getOpcode() != ISD::ADD ||
+    XORSrc1.getOpcode() != ISD::SRA)
+    return NULL;
+
+  SDValue ADDSrc0 = XORSrc0.getOperand(0);
+  SDValue ADDSrc1 = XORSrc0.getOperand(1);
+  SDValue SRASrc0 = XORSrc1.getOperand(0);
+  SDValue SRASrc1 = XORSrc1.getOperand(1);
+  ConstantSDNode *SRAConstant =  dyn_cast<ConstantSDNode>(SRASrc1);
+  EVT XType = SRASrc0.getValueType();
+  unsigned Size = XType.getSizeInBits() - 1;
+
+  if (ADDSrc1 == XORSrc1  &&
+      ADDSrc0 == SRASrc0 &&
+      XType.isInteger() &&
+      SRAConstant != NULL &&
+      Size == SRAConstant->getZExtValue()) {
+
+    unsigned Opcode = ARM::ABS;
+    if (Subtarget->isThumb2())
+      Opcode = ARM::t2ABS;
+
+    return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+  }
+
+  return NULL;
+}
+
 SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   // The only time a CONCAT_VECTORS operation can have legal types is when
   // two 64-bit vectors are concatenated to a 128-bit vector.
@@ -2207,6 +2362,25 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   return PairDRegs(VT, N->getOperand(0), N->getOperand(1));
 }
 
+SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(Node->getOperand(1)); // Ptr
+  Ops.push_back(Node->getOperand(2)); // Low part of Val1
+  Ops.push_back(Node->getOperand(3)); // High part of Val1
+  if (Opc == ARM::ATOMCMPXCHG6432) {
+    Ops.push_back(Node->getOperand(4)); // Low part of Val2
+    Ops.push_back(Node->getOperand(5)); // High part of Val2
+  }
+  Ops.push_back(Node->getOperand(0)); // Chain
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           MVT::i32, MVT::i32, MVT::Other,
+                                           Ops.data() ,Ops.size());
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
 SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
 
@@ -2215,6 +2389,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
   switch (N->getOpcode()) {
   default: break;
+  case ISD::XOR: {
+    // Select special operations if XOR node forms integer ABS pattern
+    SDNode *ResNode = SelectABSOp(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
   case ISD::Constant: {
     unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
     bool UseCP = true;
@@ -2269,8 +2451,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
     if (Subtarget->isThumb1Only()) {
-      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
-                                  CurDAG->getTargetConstant(0, MVT::i32));
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4);
     } else {
       unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
                       ARM::t2ADDri : ARM::ADDri);
@@ -2307,7 +2490,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
           return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7);
         }
       }
       if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
@@ -2323,7 +2506,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
           return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6);
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7);
         }
       }
     }
@@ -2986,6 +3169,23 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
+
+  case ARMISD::ATOMOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMOR6432);
+  case ARMISD::ATOMXOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMXOR6432);
+  case ARMISD::ATOMADD64_DAG:
+    return SelectAtomic64(N, ARM::ATOMADD6432);
+  case ARMISD::ATOMSUB64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSUB6432);
+  case ARMISD::ATOMNAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMNAND6432);
+  case ARMISD::ATOMAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMAND6432);
+  case ARMISD::ATOMSWAP64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSWAP6432);
+  case ARMISD::ATOMCMPXCHG64_DAG:
+    return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
   }
 
   return SelectCode(N);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index cf8c5baa8e7d7..e44e35673aebb 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -14,7 +14,6 @@
 
 #define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMCallingConv.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMISelLowering.h"
@@ -24,6 +23,7 @@
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
@@ -38,6 +38,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -106,7 +107,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
 
   EVT ElemTy = VT.getVectorElementType();
   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
-    setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
   if (ElemTy != MVT::i32) {
     setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
@@ -178,6 +179,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   RegInfo = TM.getRegisterInfo();
   Itins = TM.getInstrItineraryData();
 
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
   if (Subtarget->isTargetDarwin()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
@@ -419,6 +422,13 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
   }
 
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
+    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
+
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
   else
@@ -453,7 +463,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
-    setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
@@ -485,8 +495,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
-    setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
-    setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
     // a destination type that is wider than the source.
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
@@ -551,6 +561,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SRL,       MVT::i64, Custom);
   setOperationAction(ISD::SRA,       MVT::i64, Custom);
 
+  if (!Subtarget->isThumb1Only()) {
+    // FIXME: We should do this for Thumb1 as well.
+    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
+    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
+  }
+
   // ARM does not have ROTL.
   setOperationAction(ISD::ROTL,  MVT::i32, Expand);
   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
@@ -596,62 +614,46 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
+  // FIXME: This should be checking for v6k, not just v6.
   if (Subtarget->hasDataBarrier() ||
       (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
     // membarrier needs custom lowering; the rest are legal and handled
     // normally.
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+    // Custom lowering for 64-bit ops
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
+    // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+    setInsertFencesForAtomic(true);
   } else {
     // Set them all for expansion, which will force libcalls.
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
-    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8,  Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+    // Unordered/Monotonic case.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
     // Since the libcalls include locking, fold in the fences
     setShouldFoldAtomicFences(true);
   }
-  // 64-bit versions are always libcalls (for now)
-  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
 
   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
 
@@ -839,6 +841,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
   case ARMISD::RRX:           return "ARMISD::RRX";
 
+  case ARMISD::ADDC:          return "ARMISD::ADDC";
+  case ARMISD::ADDE:          return "ARMISD::ADDE";
+  case ARMISD::SUBC:          return "ARMISD::SUBC";
+  case ARMISD::SUBE:          return "ARMISD::SUBE";
+
   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
 
@@ -935,6 +942,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
+EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector()) return getPointerTy();
+  return VT.changeVectorElementTypeToInteger();
+}
+
 /// getRegClassFor - Return the register class that should be used for the
 /// specified value type.
 TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
@@ -1210,8 +1222,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   MachineFunction &MF = DAG.getMachineFunction();
   bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool IsSibCall = false;
-  // Temporarily disable tail calls so things don't break.
-  if (!EnableARMTailCalls)
+  // Disable tail calls if they're not supported.
+  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
     isTailCall = false;
   if (isTailCall) {
     // Check if it's really possible to do a tail call.
@@ -1336,10 +1348,12 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
       SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
                                          MVT::i32);
+      // TODO: Disable AlwaysInline when it becomes possible
+      //       to emit a nested call sequence.
       MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
                                           Flags.getByValAlign(),
                                           /*isVolatile=*/false,
-                                          /*AlwaysInline=*/false,
+                                          /*AlwaysInline=*/true,
                                           MachinePointerInfo(0),
                                           MachinePointerInfo(0)));
 
@@ -1404,9 +1418,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       const GlobalValue *GV = G->getGlobal();
       // Create a constant pool entry for the callee address
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
-                                                           ARMPCLabelIndex,
-                                                           ARMCP::CPValue, 0);
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
       // Get the address of the callee into a register
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1419,8 +1433,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
       // Create a constant pool entry for the callee address
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
-                                                       Sym, ARMPCLabelIndex, 0);
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 0);
       // Get the address of the callee into a register
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1441,9 +1456,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // tBX takes a register source operand.
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
-                                                           ARMPCLabelIndex,
-                                                           ARMCP::CPValue, 4);
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
@@ -1470,8 +1484,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     const char *Sym = S->getSymbol();
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
-                                                       Sym, ARMPCLabelIndex, 4);
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 4);
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
@@ -1940,9 +1955,9 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
   } else {
     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
     ARMPCLabelIndex = AFI->createPICLabelUId();
-    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
-                                                         ARMCP::CPBlockAddress,
-                                                         PCAdj);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+                                      ARMCP::CPBlockAddress, PCAdj);
     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   }
   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
@@ -1966,8 +1981,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
   ARMConstantPoolValue *CPV =
-    new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                             ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
   Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
@@ -1982,11 +1997,11 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   ArgListTy Args;
   ArgListEntry Entry;
   Entry.Node = Argument;
-  Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
   Args.push_back(Entry);
   // FIXME: is there useful debug info available here?
   std::pair<SDValue, SDValue> CallResult =
-    LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
+    LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()),
                 false, false, false, false,
                 0, CallingConv::C, false, /*isReturnValueUsed=*/true,
                 DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
@@ -2013,8 +2028,9 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
     // Initial exec model.
     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
     ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                               ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
+      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+                                      true);
     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -2030,7 +2046,8 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                          false, false, 0);
   } else {
     // local exec model
-    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -2066,7 +2083,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
   if (RelocM == Reloc::PIC_) {
     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
     ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+      ARMConstantPoolConstant::Create(GV,
+                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
@@ -2135,7 +2153,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
     ARMPCLabelIndex = AFI->createPICLabelUId();
     unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
     ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
+      ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
+                                      PCAdj);
     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   }
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -2167,9 +2186,9 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
-  ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
-                                                       "_GLOBAL_OFFSET_TABLE_",
-                                                       ARMPCLabelIndex, PCAdj);
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
+                                  ARMPCLabelIndex, PCAdj);
   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -2191,7 +2210,8 @@ SDValue
 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   SDValue Val = DAG.getConstant(0, MVT::i32);
-  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                      Op.getOperand(1), Val);
 }
 
@@ -2224,8 +2244,8 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     unsigned PCAdj = (RelocM != Reloc::PIC_)
       ? 0 : (Subtarget->isThumb() ? 4 : 8);
     ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
-                               ARMCP::CPLSDA, PCAdj);
+      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
+                                      ARMCP::CPLSDA, PCAdj);
     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue Result =
@@ -2277,6 +2297,25 @@ static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
                      DAG.getConstant(DMBOpt, MVT::i32));
 }
 
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  // FIXME: handle "fence singlethread" more efficiently.
+  DebugLoc dl = Op.getDebugLoc();
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
+}
+
 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *Subtarget) {
   // ARM pre v5TE and Thumb1 does not have preload instructions.
@@ -2754,7 +2793,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
@@ -2993,8 +3032,8 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   DebugLoc dl = Op.getDebugLoc();
 
-  EVT OperandVT = Op.getOperand(0).getValueType();
-  assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!");
+  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+         "Invalid type for custom lowering!");
   if (VT != MVT::v4f32)
     return DAG.UnrollVectorOp(Op.getNode());
 
@@ -3905,8 +3944,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       }
 
       // Try an immediate VMVN.
-      uint64_t NegatedImm = (SplatBits.getZExtValue() ^
-                             ((1LL << SplatBitSize) - 1));
+      uint64_t NegatedImm = (~SplatBits).getZExtValue();
       Val = isNEONModifiedImm(NegatedImm,
                                       SplatUndef.getZExtValue(), SplatBitSize,
                                       DAG, VmovVT, VT.is128BitVector(),
@@ -4019,6 +4057,14 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
       // A shuffle can only come from building a vector from various
       // elements of other vectors.
       return SDValue();
+    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
+               VT.getVectorElementType()) {
+      // This code doesn't know how to handle shuffles where the vector
+      // element types do not match (this happens because type legalization
+      // promotes the return type of EXTRACT_VECTOR_ELT).
+      // FIXME: It might be appropriate to extend this code to handle
+      // mismatched types.
+      return SDValue();
     }
 
     // Record this extraction against the appropriate vector if possible...
@@ -4819,6 +4865,71 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   return N0;
 }
 
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getValueType(0);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Invalid code");
+  case ISD::ADDC: Opc = ARMISD::ADDC; break;
+  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
+  case ISD::SUBC: Opc = ARMISD::SUBC; break;
+  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+  // Monotonic load/store is legal for all targets
+  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+    return Op;
+
+  // Aquire/Release load/store is not legal for targets without a
+  // dmb or equivalent available.
+  return SDValue();
+}
+
+
+static void
+ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
+                    SelectionDAG &DAG, unsigned NewOp) {
+  DebugLoc dl = Node->getDebugLoc();
+  assert (Node->getValueType(0) == MVT::i64 &&
+          "Only know how to expand i64 atomics");
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(Node->getOperand(0)); // Chain
+  Ops.push_back(Node->getOperand(1)); // Ptr
+  // Low part of Val1
+  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
+  // High part of Val1
+  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
+  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
+    // High part of Val1
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
+    // High part of Val2
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
+  }
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  SDValue Result =
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
+                            cast<MemSDNode>(Node)->getMemOperand());
+  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(Result.getValue(2));
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -4834,6 +4945,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
   case ISD::VASTART:       return LowerVASTART(Op, DAG);
   case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
+  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
@@ -4856,7 +4968,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRL_PARTS:
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
-  case ISD::VSETCC:        return LowerVSETCC(Op, DAG);
+  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -4865,6 +4977,12 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::MUL:           return LowerMUL(Op, DAG);
   case ISD::SDIV:          return LowerSDIV(Op, DAG);
   case ISD::UDIV:          return LowerUDIV(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   }
   return SDValue();
 }
@@ -4886,6 +5004,30 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::SRA:
     Res = Expand64BitShift(N, DAG, Subtarget);
     break;
+  case ISD::ATOMIC_LOAD_ADD:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_AND:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_NAND:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_OR:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_SUB:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_XOR:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
+    return;
+  case ISD::ATOMIC_SWAP:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
+    return;
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
+    return;
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -4963,7 +5105,10 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   //   cmp dest, oldval
   //   bne exitMBB
   BB = loop1MBB;
-  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
                  .addReg(dest).addReg(oldval));
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -4976,8 +5121,10 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   //   cmp scratch, #0
   //   bne loop1MBB
   BB = loop2MBB;
-  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval)
-                 .addReg(ptr));
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                  .addReg(scratch).addImm(0));
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5063,7 +5210,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   //   bne- loopMBB
   //   fallthrough --> exitMBB
   BB = loopMBB;
-  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
   if (BinOpcode) {
     // operand order needs to go the other way for NAND
     if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
@@ -5074,8 +5224,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                      addReg(dest).addReg(incr)).addReg(0);
   }
 
-  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
-                 .addReg(ptr));
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                  .addReg(scratch).addImm(0));
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5125,12 +5277,12 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   case 1:
     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
     strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
-    extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr;
+    extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
     break;
   case 2:
     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
     strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
-    extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr;
+    extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
     break;
   case 4:
     ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
@@ -5170,12 +5322,17 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   //   bne- loopMBB
   //   fallthrough --> exitMBB
   BB = loopMBB;
-  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
 
   // Sign extend the value, if necessary.
   if (signExtend && extendOpc) {
     oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
+    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
+                     .addReg(dest)
+                     .addImm(0));
   }
 
   // Build compare and cmov instructions.
@@ -5184,8 +5341,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
          .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
 
-  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
-                 .addReg(ptr));
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                  .addReg(scratch).addImm(0));
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5203,79 +5362,596 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   return BB;
 }
 
-static
-MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
-  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I)
-    if (*I != Succ)
-      return *I;
-  llvm_unreachable("Expecting a BB with two successors!");
-}
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
+                                      unsigned Op1, unsigned Op2,
+                                      bool NeedsCarry, bool IsCmpxchg) const {
+  // This also handles ATOMIC_SWAP, indicated by Op1==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 
-// FIXME: This opcode table should obviously be expressed in the target
-// description. We probably just need a "machine opcode" value in the pseudo
-// instruction. But the ideal solution maybe to simply remove the "S" version
-// of the opcode altogether.
-struct AddSubFlagsOpcodePair {
-  unsigned PseudoOpc;
-  unsigned MachineOpc;
-};
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
 
-static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
-  {ARM::ADCSri, ARM::ADCri},
-  {ARM::ADCSrr, ARM::ADCrr},
-  {ARM::ADCSrs, ARM::ADCrs},
-  {ARM::SBCSri, ARM::SBCri},
-  {ARM::SBCSrr, ARM::SBCrr},
-  {ARM::SBCSrs, ARM::SBCrs},
-  {ARM::RSBSri, ARM::RSBri},
-  {ARM::RSBSrr, ARM::RSBrr},
-  {ARM::RSBSrs, ARM::RSBrs},
-  {ARM::RSCSri, ARM::RSCri},
-  {ARM::RSCSrs, ARM::RSCrs},
-  {ARM::t2ADCSri, ARM::t2ADCri},
-  {ARM::t2ADCSrr, ARM::t2ADCrr},
-  {ARM::t2ADCSrs, ARM::t2ADCrs},
-  {ARM::t2SBCSri, ARM::t2SBCri},
-  {ARM::t2SBCSrr, ARM::t2SBCrr},
-  {ARM::t2SBCSrs, ARM::t2SBCrs},
-  {ARM::t2RSBSri, ARM::t2RSBri},
-  {ARM::t2RSBSrs, ARM::t2RSBrs},
-};
+  unsigned destlo = MI->getOperand(0).getReg();
+  unsigned desthi = MI->getOperand(1).getReg();
+  unsigned ptr = MI->getOperand(2).getReg();
+  unsigned vallo = MI->getOperand(3).getReg();
+  unsigned valhi = MI->getOperand(4).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
 
-// Convert and Add or Subtract with Carry and Flags to a generic opcode with
-// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>).
-//
-// FIXME: Somewhere we should assert that CPSR<def> is in the correct
-// position to be recognized by the target descrition as the 'S' bit.
-bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI,
-                                             MachineBasicBlock *BB) const {
-  unsigned OldOpc = MI->getOpcode();
-  unsigned NewOpc = 0;
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
 
-  // This is only called for instructions that need remapping, so iterating over
-  // the tiny opcode table is not costly.
-  static const int NPairs =
-    sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
-  for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0],
-         *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) {
-    if (OldOpc == Pair->PseudoOpc) {
-      NewOpc = Pair->MachineOpc;
-      break;
+  unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
+  unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD;
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *contBB = 0, *cont2BB = 0;
+  if (IsCmpxchg) {
+    contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
+  }
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  if (IsCmpxchg) {
+    MF->insert(It, contBB);
+    MF->insert(It, cont2BB);
+  }
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned storesuccess = MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrexd r2, r3, ptr
+  //   <binopa> r0, r2, incr
+  //   <binopb> r1, r3, incr
+  //   strexd storesuccess, r0, r1, ptr
+  //   cmp storesuccess, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  //
+  // Note that the registers are explicitly specified because there is not any
+  // way to force the register allocator to allocate a register pair.
+  //
+  // FIXME: The hardcoded registers are not necessary for Thumb2, but we
+  // need to properly enforce the restriction that the two output registers
+  // for ldrexd must be different.
+  BB = loopMBB;
+  // Load
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                 .addReg(ARM::R2, RegState::Define)
+                 .addReg(ARM::R3, RegState::Define).addReg(ptr));
+  // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
+  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
+
+  if (IsCmpxchg) {
+    // Add early exit
+    for (unsigned i = 0; i < 2; i++) {
+      AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
+                                                         ARM::CMPrr))
+                     .addReg(i == 0 ? destlo : desthi)
+                     .addReg(i == 0 ? vallo : valhi));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+        .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+      BB->addSuccessor(exitMBB);
+      BB->addSuccessor(i == 0 ? contBB : cont2BB);
+      BB = (i == 0 ? contBB : cont2BB);
     }
+
+    // Copy to physregs for strexd
+    unsigned setlo = MI->getOperand(5).getReg();
+    unsigned sethi = MI->getOperand(6).getReg();
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo);
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi);
+  } else if (Op1) {
+    // Perform binary operation
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
+                   .addReg(destlo).addReg(vallo))
+        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
+                   .addReg(desthi).addReg(valhi)).addReg(0);
+  } else {
+    // Copy to physregs for strexd
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
   }
-  if (!NewOpc)
-    return false;
 
+  // Store
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
+                 .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
+  // Cmp+jump
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(storesuccess).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+/// EmitBasePointerRecalculation - For functions using a base pointer, we
+/// rematerialize it (via the frame pointer).
+void ARMTargetLowering::
+EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB,
+                             MachineBasicBlock *DispatchBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  MachineFunction &MF = *MI->getParent()->getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  if (!RI.hasBasePointer(MF)) return;
+
+  MachineBasicBlock::iterator MBBI = MI;
+
+  int32_t NumBytes = AFI->getFramePtrSpillOffset();
+  unsigned FramePtr = RI.getFrameRegister(MF);
+  assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+         "Base pointer without frame pointer?");
+
+  if (AFI->isThumb2Function())
+    llvm::emitT2RegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+                                 FramePtr, -NumBytes, ARMCC::AL, 0, *AII);
+  else if (AFI->isThumbFunction())
+    llvm::emitThumbRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+                                    FramePtr, -NumBytes, *AII, RI);
+  else
+    llvm::emitARMRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+                                  FramePtr, -NumBytes, ARMCC::AL, 0, *AII);
+
+  if (!RI.needsStackRealignment(MF)) return;
+
+  // If there's dynamic realignment, adjust for it.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  assert(!AFI->isThumb1OnlyFunction());
+
+  // Emit bic r6, r6, MaxAlign
+  unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri;
+  AddDefaultCC(
+    AddDefaultPred(
+      BuildMI(*MBB, MBBI, MI->getDebugLoc(), TII->get(bicOpc), ARM::R6)
+      .addReg(ARM::R6, RegState::Kill)
+      .addImm(MaxAlign - 1)));
+}
+
+/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+/// registers the function context.
+void ARMTargetLowering::
+SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
+                       MachineBasicBlock *DispatchBB, int FI) const {
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
-  for (unsigned i = 0; i < MI->getNumOperands(); ++i)
-    MIB.addOperand(MI->getOperand(i));
-  AddDefaultPred(MIB);
-  MIB.addReg(ARM::CPSR, RegState::Define); // S bit
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineConstantPool *MCP = MF->getConstantPool();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  const Function *F = MF->getFunction();
+
+  bool isThumb = Subtarget->isThumb();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+  const TargetRegisterClass *TRC =
+    isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+
+  // Grab constant pool and fixed stack memory operands.
+  MachineMemOperand *CPMMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
+                             MachineMemOperand::MOLoad, 4, 4);
+
+  MachineMemOperand *FIMMOSt =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOStore, 4, 4);
+
+  EmitBasePointerRecalculation(MI, MBB, DispatchBB);
+
+  // Load the address of the dispatch MBB into the jump buffer.
+  if (isThumb2) {
+    // Incoming value: jbuf
+    //   ldr.n  r5, LCPI1_1
+    //   orr    r5, r5, #1
+    //   add    r5, pc
+    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+                     .addReg(NewVReg1, RegState::Kill)
+                     .addImm(0x01)));
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+      .addReg(NewVReg2, RegState::Kill)
+      .addImm(PCLabelId);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+                   .addReg(NewVReg3, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  } else if (isThumb) {
+    // Incoming value: jbuf
+    //   ldr.n  r1, LCPI1_4
+    //   add    r1, pc
+    //   mov    r2, #1
+    //   orrs   r1, r2
+    //   add    r2, $jbuf, #+4 ; &jbuf[1]
+    //   str    r1, [r2]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+      .addReg(NewVReg1, RegState::Kill)
+      .addImm(PCLabelId);
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addImm(1));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3, RegState::Kill));
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
+                   .addFrameIndex(FI)
+                   .addImm(36)); // &jbuf[1] :: pc
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addReg(NewVReg5, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(FIMMOSt));
+  } else {
+    // Incoming value: jbuf
+    //   ldr  r1, LCPI1_1
+    //   add  r1, pc, r1
+    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addImm(0)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+                   .addReg(NewVReg1, RegState::Kill)
+                   .addImm(PCLabelId));
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  }
+}
+
+MachineBasicBlock *ARMTargetLowering::
+EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF->getFrameInfo();
+  int FI = MFI->getFunctionContextIndex();
+
+  const TargetRegisterClass *TRC =
+    Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  MachineModuleInfo &MMI = MF->getMMI();
+  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) {
+    if (!BB->isLandingPad()) continue;
+
+    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+    // pad.
+    for (MachineBasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+      if (!II->isEHLabel()) continue;
+
+      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
+      if (!MMI.hasCallSiteLandingPad(Sym)) continue;
+
+      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
+      for (SmallVectorImpl<unsigned>::iterator
+             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
+           CSI != CSE; ++CSI) {
+        CallSiteNumToLPad[*CSI].push_back(BB);
+        MaxCSNum = std::max(MaxCSNum, *CSI);
+      }
+      break;
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock*> LPadList;
+  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+  for (unsigned I = 1; I <= MaxCSNum; ++I) {
+    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+    for (SmallVectorImpl<MachineBasicBlock*>::iterator
+           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+      LPadList.push_back(*II);
+      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the jump table and associated information.
+  MachineJumpTableInfo *JTI =
+    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+  unsigned UId = AFI->createJumpTableUId();
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsLandingPad();
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert and renumber MBBs.
+  MachineBasicBlock *Last = &MF->back();
+  MF->insert(MF->end(), DispatchBB);
+  MF->insert(MF->end(), DispContBB);
+  MF->insert(MF->end(), TrapBB);
+  MF->RenumberBlocks(Last);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+  MachineMemOperand *FIMMOLd =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOLoad |
+                             MachineMemOperand::MOVolatile, 4, 4);
+
+  if (Subtarget->isThumb2()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+                   .addReg(NewVReg1)
+                   .addImm(LPadList.size()));
+    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg2)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(
+        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg3)
+        .addReg(NewVReg2, RegState::Kill)
+        .addReg(NewVReg1)
+        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+      .addReg(NewVReg3, RegState::Kill)
+      .addReg(NewVReg1)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else if (Subtarget->isThumb()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(1)
+                   .addMemOperand(FIMMOLd));
+
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+                   .addReg(NewVReg1)
+                   .addImm(LPadList.size()));
+    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg1)
+                   .addImm(2));
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(JTMMOLd));
+
+    unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg5, RegState::Kill)
+                   .addReg(NewVReg3));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+      .addReg(NewVReg6, RegState::Kill)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+                   .addReg(NewVReg1)
+                   .addImm(LPadList.size()));
+    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg2)
+                     .addReg(NewVReg1)
+                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(
+      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg4)
+      .addReg(NewVReg2, RegState::Kill)
+      .addReg(NewVReg3)
+      .addImm(0)
+      .addMemOperand(JTMMOLd));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+      .addReg(NewVReg4, RegState::Kill)
+      .addReg(NewVReg3)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  }
+
+  // Add the jump table entries as successors to the MBB.
+  MachineBasicBlock *PrevMBB = 0;
+  for (std::vector<MachineBasicBlock*>::iterator
+         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
+    MachineBasicBlock *CurMBB = *I;
+    if (PrevMBB != CurMBB)
+      DispContBB->addSuccessor(CurMBB);
+    PrevMBB = CurMBB;
+  }
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+  const unsigned *SavedRegs = RI.getCalleeSavedRegs(MF);
+  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
+         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
+    MachineBasicBlock *BB = *I;
+
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    for (MachineBasicBlock::succ_iterator
+           SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock *SMBB = *SI;
+      if (SMBB->isLandingPad()) {
+        BB->removeSuccessor(SMBB);
+        SMBB->setIsLandingPad(false);
+      }
+    }
+
+    BB->addSuccessor(DispatchBB);
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (MachineBasicBlock::reverse_iterator
+           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+      if (!II->getDesc().isCall()) continue;
+
+      DenseMap<unsigned, bool> DefRegs;
+      for (MachineInstr::mop_iterator
+             OI = II->operands_begin(), OE = II->operands_end();
+           OI != OE; ++OI) {
+        if (!OI->isReg()) continue;
+        DefRegs[OI->getReg()] = true;
+      }
+
+      MachineInstrBuilder MIB(&*II);
+
+      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+        if (!TRC->contains(SavedRegs[i])) continue;
+        if (!DefRegs[SavedRegs[i]])
+          MIB.addReg(SavedRegs[i], RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      break;
+    }
+  }
+
+  // The instruction is gone now.
   MI->eraseFromParent();
-  return true;
+
+  return MBB;
+}
+
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I != Succ)
+      return *I;
+  llvm_unreachable("Expecting a BB with two successors!");
 }
 
 MachineBasicBlock *
@@ -5286,12 +5962,61 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   bool isThumb2 = Subtarget->isThumb2();
   switch (MI->getOpcode()) {
   default: {
-    if (RemapAddSubWithFlags(MI, BB))
-      return BB;
-
     MI->dump();
     llvm_unreachable("Unexpected instr type to insert");
   }
+  // The Thumb2 pre-indexed stores have the same MI operands, they just
+  // define them differently in the .td files from the isel patterns, so
+  // they need pseudos.
+  case ARM::t2STR_preidx:
+    MI->setDesc(TII->get(ARM::t2STR_PRE));
+    return BB;
+  case ARM::t2STRB_preidx:
+    MI->setDesc(TII->get(ARM::t2STRB_PRE));
+    return BB;
+  case ARM::t2STRH_preidx:
+    MI->setDesc(TII->get(ARM::t2STRH_PRE));
+    return BB;
+
+  case ARM::STRi_preidx:
+  case ARM::STRBi_preidx: {
+    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
+      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+    // Decode the offset.
+    unsigned Offset = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+    Offset = ARM_AM::getAM2Offset(Offset);
+    if (isSub)
+      Offset = -Offset;
+
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    BuildMI(*BB, MI, dl, TII->get(NewOpc))
+      .addOperand(MI->getOperand(0))  // Rn_wb
+      .addOperand(MI->getOperand(1))  // Rt
+      .addOperand(MI->getOperand(2))  // Rn
+      .addImm(Offset)                 // offset (skip GPR==zero_reg)
+      .addOperand(MI->getOperand(5))  // pred
+      .addOperand(MI->getOperand(6))
+      .addMemOperand(MMO);
+    MI->eraseFromParent();
+    return BB;
+  }
+  case ARM::STRr_preidx:
+  case ARM::STRBr_preidx:
+  case ARM::STRH_preidx: {
+    unsigned NewOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+    }
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+      MIB.addOperand(MI->getOperand(i));
+    MI->eraseFromParent();
+    return BB;
+  }
   case ARM::ATOMIC_LOAD_ADD_I8:
      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
   case ARM::ATOMIC_LOAD_ADD_I16:
@@ -5370,6 +6095,31 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
 
+
+  case ARM::ATOMADD6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
+                              isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
+                              /*NeedsCarry*/ true);
+  case ARM::ATOMSUB6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true);
+  case ARM::ATOMOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
+                              isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMXOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
+                              isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMAND6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
+                              isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMSWAP6432:
+    return EmitAtomicBinary64(MI, BB, 0, 0, false);
+  case ARM::ATOMCMPXCHG6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ false, /*IsCmpxchg*/true);
+
   case ARM::tMOVCCr_pseudo: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
@@ -5461,13 +6211,159 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 
     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
-    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B))
-      .addMBB(exitMBB);
+    if (isThumb2)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+    else
+      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
 
     MI->eraseFromParent();   // The pseudo instruction is gone now.
     return BB;
   }
+
+  case ARM::ABS:
+  case ARM::t2ABS: {
+    // To insert an ABS instruction, we have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // source vreg to test against 0, the destination vreg to set,
+    // the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use. 
+    // It transforms
+    //     V1 = ABS V0
+    // into
+    //     V2 = MOVS V0
+    //     BCC                      (branch to SinkBB if V0 >= 0)
+    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
+    //     SinkBB: V1 = PHI(V2, V3)     
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator BBI = BB;
+    ++BBI;
+    MachineFunction *Fn = BB->getParent();
+    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
+    Fn->insert(BBI, RSBBB);
+    Fn->insert(BBI, SinkBB);
+
+    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
+    unsigned int ABSDstReg = MI->getOperand(0).getReg();
+    bool isThumb2 = Subtarget->isThumb2();
+    MachineRegisterInfo &MRI = Fn->getRegInfo();
+    // In Thumb mode S must not be specified if source register is the SP or
+    // PC and if destination register is the SP, so restrict register class
+    unsigned NewMovDstReg = MRI.createVirtualRegister(
+      isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
+    unsigned NewRsbDstReg = MRI.createVirtualRegister(
+      isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    SinkBB->splice(SinkBB->begin(), BB,
+      llvm::next(MachineBasicBlock::iterator(MI)),
+      BB->end());
+    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(RSBBB);
+    BB->addSuccessor(SinkBB);
+
+    // fall through to SinkMBB
+    RSBBB->addSuccessor(SinkBB);
+
+    // insert a movs at the end of BB
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr),
+      NewMovDstReg)
+      .addReg(ABSSrcReg, RegState::Kill)
+      .addImm((unsigned)ARMCC::AL).addReg(0)
+      .addReg(ARM::CPSR, RegState::Define);
+
+    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+    BuildMI(BB, dl, 
+      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+    // insert rsbri in RSBBB
+    // Note: BCC and rsbri will be converted into predicated rsbmi
+    // by if-conversion pass
+    BuildMI(*RSBBB, RSBBB->begin(), dl, 
+      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+      .addReg(NewMovDstReg, RegState::Kill)
+      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+    // insert PHI in SinkBB, 
+    // reuse ABSDstReg to not change uses of ABS instruction
+    BuildMI(*SinkBB, SinkBB->begin(), dl,
+      TII->get(ARM::PHI), ABSDstReg)
+      .addReg(NewRsbDstReg).addMBB(RSBBB)
+      .addReg(NewMovDstReg).addMBB(BB);
+
+    // remove ABS instruction
+    MI->eraseFromParent(); 
+
+    // return last added BB
+    return SinkBB;
+  }
+  }
+}
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                      SDNode *Node) const {
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (!MCID.hasPostISelHook()) {
+    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
+    return;
+  }
+
+  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
+  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+  // operand is still set to noreg. If needed, set the optional operand's
+  // register to CPSR, and remove the redundant implicit def.
+  //
+  // e.g. ADCS (...opt:%noreg, CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+  // Rename pseudo opcodes.
+  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+  if (NewOpc) {
+    const ARMBaseInstrInfo *TII =
+      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+    MI->setDesc(TII->get(NewOpc));
+  }
+  unsigned ccOutIdx = MCID.getNumOperands() - 1;
+
+  // Any ARM instruction that sets the 's' bit should specify an optional
+  // "cc_out" operand in the last operand position.
+  if (!MCID.hasOptionalDef() || !MCID.OpInfo[ccOutIdx].isOptionalDef()) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+  // since we already have an optional CPSR def.
+  bool definesCPSR = false;
+  bool deadCPSR = false;
+  for (unsigned i = MCID.getNumOperands(), e = MI->getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+      definesCPSR = true;
+      if (MO.isDead())
+        deadCPSR = true;
+      MI->RemoveOperand(i);
+      break;
+    }
+  }
+  if (!definesCPSR) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+  if (deadCPSR) {
+    assert(!MI->getOperand(ccOutIdx).getReg() &&
+           "expect uninitialized optional cc_out operand");
+    return;
   }
+
+  // If this instruction was defined with an optional CPSR def and its dag node
+  // had a live implicit CPSR def, then activate the optional CPSR def.
+  MachineOperand &MO = MI->getOperand(ccOutIdx);
+  MO.setReg(ARM::CPSR);
+  MO.setIsDef(true);
 }
 
 //===----------------------------------------------------------------------===//
@@ -6975,7 +7871,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   SDValue FalseVal = N->getOperand(0);
   SDValue TrueVal = N->getOperand(1);
   SDValue ARMcc = N->getOperand(2);
-  ARMCC::CondCodes CC = (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+  ARMCC::CondCodes CC =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
 
   // Simplify
   //   mov     r1, r0
@@ -6995,7 +7892,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   //   movne   r0, y
   /// FIXME: Turn this into a target neutral optimization?
   SDValue Res;
-  if (CC == ARMCC::NE && FalseVal == RHS) {
+  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                       N->getOperand(3), Cmp);
   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
@@ -7235,7 +8132,7 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              const Type *Ty) const {
+                                              Type *Ty) const {
   EVT VT = getValueType(Ty, true);
   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
     return false;
@@ -7351,7 +8248,8 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
 
     if (Ptr->getOpcode() == ISD::ADD) {
       isInc = true;
-      ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
+      ARM_AM::ShiftOpc ShOpcVal=
+        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
       if (ShOpcVal != ARM_AM::no_shift) {
         Base = Ptr->getOperand(1);
         Offset = Ptr->getOperand(0);
@@ -7536,7 +8434,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
     if (AsmPieces.size() == 3 &&
         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
-      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
       if (Ty && Ty->getBitWidth() == 32)
         return IntrinsicLowering::LowerToByteSwap(CI);
     }
@@ -7559,6 +8457,9 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
     case 'x': return C_RegisterClass;
     case 't': return C_RegisterClass;
     case 'j': return C_Other; // Constant for movw.
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'Q': return C_Memory;
     }
   } else if (Constraint.size() == 2) {
     switch (Constraint[0]) {
@@ -7582,7 +8483,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
     // but allow it at the lowest weight.
   if (CallOperandVal == NULL)
     return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
   // Look at the constraint type.
   switch (*constraint) {
   default:
@@ -7618,7 +8519,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return RCPair(0U, ARM::GPRRegisterClass);
     case 'h': // High regs or no regs.
       if (Subtarget->isThumb())
-	return RCPair(0U, ARM::hGPRRegisterClass);
+        return RCPair(0U, ARM::hGPRRegisterClass);
       break;
     case 'r':
       return RCPair(0U, ARM::GPRRegisterClass);
@@ -7632,15 +8533,15 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       break;
     case 'x':
       if (VT == MVT::f32)
-	return RCPair(0U, ARM::SPR_8RegisterClass);
+        return RCPair(0U, ARM::SPR_8RegisterClass);
       if (VT.getSizeInBits() == 64)
-	return RCPair(0U, ARM::DPR_8RegisterClass);
+        return RCPair(0U, ARM::DPR_8RegisterClass);
       if (VT.getSizeInBits() == 128)
-	return RCPair(0U, ARM::QPR_8RegisterClass);
+        return RCPair(0U, ARM::QPR_8RegisterClass);
       break;
     case 't':
       if (VT == MVT::f32)
-	return RCPair(0U, ARM::SPRRegisterClass);
+        return RCPair(0U, ARM::SPRRegisterClass);
       break;
     }
   }
@@ -7680,12 +8581,12 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 
     switch (ConstraintLetter) {
       case 'j':
-	// Constant suitable for movw, must be between 0 and
-	// 65535.
-	if (Subtarget->hasV6T2Ops())
-	  if (CVal >= 0 && CVal <= 65535)
-	    break;
-	return;
+        // Constant suitable for movw, must be between 0 and
+        // 65535.
+        if (Subtarget->hasV6T2Ops())
+          if (CVal >= 0 && CVal <= 65535)
+            break;
+        return;
       case 'I':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between 0 and 255, for ADD
@@ -7823,50 +8724,6 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   return false;
 }
 
-int ARM::getVFPf32Imm(const APFloat &FPImm) {
-  APInt Imm = FPImm.bitcastToAPInt();
-  uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
-  int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
-  int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
-
-  // We can handle 4 bits of mantissa.
-  // mantissa = (16+UInt(e:f:g:h))/16.
-  if (Mantissa & 0x7ffff)
-    return -1;
-  Mantissa >>= 19;
-  if ((Mantissa & 0xf) != Mantissa)
-    return -1;
-
-  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
-  if (Exp < -3 || Exp > 4)
-    return -1;
-  Exp = ((Exp+3) & 0x7) ^ 4;
-
-  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
-}
-
-int ARM::getVFPf64Imm(const APFloat &FPImm) {
-  APInt Imm = FPImm.bitcastToAPInt();
-  uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
-  int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023;   // -1022 to 1023
-  uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffLL;
-
-  // We can handle 4 bits of mantissa.
-  // mantissa = (16+UInt(e:f:g:h))/16.
-  if (Mantissa & 0xffffffffffffLL)
-    return -1;
-  Mantissa >>= 48;
-  if ((Mantissa & 0xf) != Mantissa)
-    return -1;
-
-  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
-  if (Exp < -3 || Exp > 4)
-    return -1;
-  Exp = ((Exp+3) & 0x7) ^ 4;
-
-  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
-}
-
 bool ARM::isBitFieldInvertedMask(unsigned v) {
   if (v == 0xffffffff)
     return 0;
@@ -7889,9 +8746,9 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   if (!Subtarget->hasVFP3())
     return false;
   if (VT == MVT::f32)
-    return ARM::getVFPf32Imm(Imm) != -1;
+    return ARM_AM::getFP32Imm(Imm) != -1;
   if (VT == MVT::f64)
-    return ARM::getVFPf64Imm(Imm) != -1;
+    return ARM_AM::getFP64Imm(Imm) != -1;
   return false;
 }
 
@@ -7933,7 +8790,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
-      const Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
       NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 980fb404887e0..5da9b27fca68e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -71,6 +71,11 @@ namespace llvm {
       SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
       RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
 
+      ADDC,         // Add with carry
+      ADDE,         // Add using carry
+      SUBC,         // Sub with carry
+      SUBE,         // Sub using carry
+
       VMOVRRD,      // double to two gprs.
       VMOVDRR,      // Two gprs to double.
 
@@ -206,18 +211,22 @@ namespace llvm {
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
-      VST4LN_UPD
+      VST4LN_UPD,
+
+      // 64-bit atomic ops (value split into two registers)
+      ATOMADD64_DAG,
+      ATOMSUB64_DAG,
+      ATOMOR64_DAG,
+      ATOMXOR64_DAG,
+      ATOMAND64_DAG,
+      ATOMNAND64_DAG,
+      ATOMSWAP64_DAG,
+      ATOMCMPXCHG64_DAG
     };
   }
 
   /// Define some predicates that are used for node matching.
   namespace ARM {
-    /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be
-    /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd)
-    /// instruction, returns its 8-bit integer representation. Otherwise,
-    /// returns -1.
-    int getVFPf32Imm(const APFloat &FPImm);
-    int getVFPf64Imm(const APFloat &FPImm);
     bool isBitFieldInvertedMask(unsigned v);
   }
 
@@ -240,10 +249,16 @@ namespace llvm {
 
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    virtual EVT getSetCCResultType(EVT VT) const;
+
     virtual MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const;
 
+    virtual void
+    AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const;
+
     SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
@@ -256,7 +271,7 @@ namespace llvm {
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
     bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -485,12 +500,28 @@ namespace llvm {
                                         MachineBasicBlock *BB,
                                         unsigned Size,
                                         unsigned BinOpcode) const;
+    MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Op1,
+                                          unsigned Op2,
+                                          bool NeedsCarry = false,
+                                          bool IsCmpxchg = false) const;
     MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
                                                MachineBasicBlock *BB,
                                                unsigned Size,
                                                bool signExtend,
                                                ARMCC::CondCodes Cond) const;
 
+    void EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB,
+                                      MachineBasicBlock *DispatchBB) const;
+
+    void SetupEntryBlockForSjLj(MachineInstr *MI,
+                                MachineBasicBlock *MBB,
+                                MachineBasicBlock *DispatchBB, int FI) const;
+
+    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr *MI,
+                                             MachineBasicBlock *MBB) const;
+
     bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
   };
 
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 3ccf22f80b7da..7cbc9111dec37 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -25,7 +25,7 @@ def BrFrm         : Format<2>;
 def BrMiscFrm     : Format<3>;
 
 def DPFrm         : Format<4>;
-def DPSoRegFrm    : Format<5>;
+def DPSoRegRegFrm    : Format<5>;
 
 def LdFrm         : Format<6>;
 def StFrm         : Format<7>;
@@ -68,6 +68,7 @@ def N3RegVShFrm   : Format<38>;
 def NVExtFrm      : Format<39>;
 def NVMulSLFrm    : Format<40>;
 def NVTBLFrm      : Format<41>;
+def DPSoRegImmFrm  : Format<42>;
 
 // Misc flags.
 
@@ -130,39 +131,15 @@ def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
 // ARM special operands.
 //
 
-def CondCodeOperand : AsmOperandClass {
-  let Name = "CondCode";
-  let SuperClasses = [];
-}
-
-def CCOutOperand : AsmOperandClass {
-  let Name = "CCOut";
-  let SuperClasses = [];
-}
-
-def MemBarrierOptOperand : AsmOperandClass {
-  let Name = "MemBarrierOpt";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseMemBarrierOptOperand";
-}
-
-def ProcIFlagsOperand : AsmOperandClass {
-  let Name = "ProcIFlags";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseProcIFlagsOperand";
-}
-
-def MSRMaskOperand : AsmOperandClass {
-  let Name = "MSRMask";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseMSRMaskOperand";
-}
-
 // ARM imod and iflag operands, used only by the CPS instruction.
 def imod_op : Operand<i32> {
   let PrintMethod = "printCPSIMod";
 }
 
+def ProcIFlagsOperand : AsmOperandClass {
+  let Name = "ProcIFlags";
+  let ParserMethod = "parseProcIFlagsOperand";
+}
 def iflags_op : Operand<i32> {
   let PrintMethod = "printCPSIFlag";
   let ParserMatchClass = ProcIFlagsOperand;
@@ -170,17 +147,21 @@ def iflags_op : Operand<i32> {
 
 // ARM Predicate operand. Default to 14 = always (AL). Second part is CC
 // register whose default is 0 (no register).
-def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
                                      (ops (i32 14), (i32 zero_reg))> {
   let PrintMethod = "printPredicateOperand";
   let ParserMatchClass = CondCodeOperand;
+  let DecoderMethod = "DecodePredicateOperand";
 }
 
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
   let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
   let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
 }
 
 // Same as cc_out except it defaults to setting CPSR.
@@ -188,16 +169,27 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
   let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
   let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
 }
 
 // ARM special operands for disassembly only.
 //
+def SetEndAsmOperand : AsmOperandClass {
+  let Name = "SetEndImm";
+  let ParserMethod = "parseSetEndImm";
+}
 def setend_op : Operand<i32> {
   let PrintMethod = "printSetendOperand";
+  let ParserMatchClass = SetEndAsmOperand;
 }
 
+def MSRMaskOperand : AsmOperandClass {
+  let Name = "MSRMask";
+  let ParserMethod = "parseMSRMaskOperand";
+}
 def msr_mask : Operand<i32> {
   let PrintMethod = "printMSRMaskOperand";
+  let DecoderMethod = "DecodeMSRMask";
   let ParserMatchClass = MSRMaskOperand;
 }
 
@@ -211,21 +203,40 @@ def msr_mask : Operand<i32> {
 //     64       64 - <imm> is encoded in imm6<5:0>
 def shr_imm8  : Operand<i32> {
   let EncoderMethod = "getShiftRight8Imm";
+  let DecoderMethod = "DecodeShiftRight8Imm";
 }
 def shr_imm16 : Operand<i32> {
   let EncoderMethod = "getShiftRight16Imm";
+  let DecoderMethod = "DecodeShiftRight16Imm";
 }
 def shr_imm32 : Operand<i32> {
   let EncoderMethod = "getShiftRight32Imm";
+  let DecoderMethod = "DecodeShiftRight32Imm";
 }
 def shr_imm64 : Operand<i32> {
   let EncoderMethod = "getShiftRight64Imm";
+  let DecoderMethod = "DecodeShiftRight64Imm";
 }
 
+//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
+class  tInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction templates.
 //
 
+
 class InstTemplate<AddrMode am, int sz, IndexMode im,
                    Format f, Domain d, string cstr, InstrItinClass itin>
   : Instruction {
@@ -240,17 +251,22 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
   Domain D = d;
   bit isUnaryDataProc = 0;
   bit canXformTo16Bit = 0;
+  // The instruction is a 16-bit flag setting Thumb instruction. Used
+  // by the parser to determine whether to require the 'S' suffix on the
+  // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+  bit thumbArithFlagSetting = 0;
 
   // If this is a pseudo instruction, mark it isCodeGenOnly.
   let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
 
-  // The layout of TSFlags should be kept in sync with ARMBaseInstrInfo.h.
+  // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
   let TSFlags{4-0}   = AM.Value;
   let TSFlags{6-5}   = IndexModeBits;
   let TSFlags{12-7} = Form;
   let TSFlags{13}    = isUnaryDataProc;
   let TSFlags{14}    = canXformTo16Bit;
   let TSFlags{17-15} = D.Value;
+  let TSFlags{18}    = thumbArithFlagSetting;
 
   let Constraints = cstr;
   let Itinerary = itin;
@@ -262,13 +278,17 @@ class Encoding {
 
 class InstARM<AddrMode am, int sz, IndexMode im,
               Format f, Domain d, string cstr, InstrItinClass itin>
-  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding;
+  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+  let DecoderNamespace = "ARM";
+}
 
 // This Encoding-less class is used by Thumb1 to specify the encoding bits later
 // on by adding flavors to specific instructions.
 class InstThumb<AddrMode am, int sz, IndexMode im,
                 Format f, Domain d, string cstr, InstrItinClass itin>
-  : InstTemplate<am, sz, im, f, d, cstr, itin>;
+  : InstTemplate<am, sz, im, f, d, cstr, itin> {
+  let DecoderNamespace = "Thumb";
+}
 
 class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
   : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
@@ -426,11 +446,11 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
   bits<4> Rt;
-  bits<4> Rn;
+  bits<4> addr;
   let Inst{27-23} = 0b00011;
   let Inst{22-21} = opcod;
   let Inst{20}    = 1;
-  let Inst{19-16} = Rn;
+  let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
   let Inst{11-0}  = 0b111110011111;
 }
@@ -450,14 +470,14 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{3-0}   = Rt;
 }
 class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
-  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, [$Rn]", pattern> {
+  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
   bits<4> Rt;
   bits<4> Rt2;
-  bits<4> Rn;
+  bits<4> addr;
   let Inst{27-23} = 0b00010;
   let Inst{22} = b;
   let Inst{21-20} = 0b00;
-  let Inst{19-16} = Rn;
+  let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
   let Inst{11-4} = 0b00001001;
   let Inst{3-0} = Rt2;
@@ -515,22 +535,41 @@ class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
   let Inst{20}    = isLd; // L bit
   let Inst{15-12} = Rt;
 }
-class AI2stridx<bit isByte, bit isPre, dag oops, dag iops,
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+}
+
+class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops,
                 IndexMode im, Format f, InstrItinClass itin, string opc,
                 string asm, string cstr, list<dag> pattern>
   : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
                pattern> {
   // AM2 store w/ two operands: (GPR, am2offset)
-  // {13}     1 == Rm, 0 == imm12
   // {12}     isAdd
   // {11-0}   imm12/Rm
   bits<14> offset;
   bits<4> Rn;
-  let Inst{25} = offset{13};
+  let Inst{25} = 0;
   let Inst{23} = offset{12};
   let Inst{19-16} = Rn;
   let Inst{11-0} = offset{11-0};
 }
+
+
 // FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
 // but for now use this class for STRT and STRBT.
 class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
@@ -568,9 +607,11 @@ class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{7-4}   = op;
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+
+  let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 
-class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
                 IndexMode im, Format f, InstrItinClass itin, string opc,
                 string asm, string cstr, list<dag> pattern>
   : I<oops, iops, AddrMode3, 4, im, f, itin,
@@ -586,48 +627,24 @@ class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
 
 // FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
 // but for now use this class for LDRSBT, LDRHT, LDSHT.
-class AI3ldstidxT<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
                   IndexMode im, Format f, InstrItinClass itin, string opc,
                   string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, 4, im, f, itin,
-      opc, asm, cstr, pattern> {
+  : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
   // {13}     1 == imm8, 0 == Rm
   // {12-9}   Rn
   // {8}      isAdd
   // {7-4}    imm7_4/zero
   // {3-0}    imm3_0/Rm
-  bits<14> addr;
-  bits<4> Rt;
-  let Inst{27-25} = 0b000;
-  let Inst{24}    = isPre;        // P bit
-  let Inst{23}    = addr{8};      // U bit
-  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
-  let Inst{20}    = op20;         // L bit
-  let Inst{19-16} = addr{12-9};   // Rn
-  let Inst{15-12} = Rt;           // Rt
-  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
-  let Inst{7-4}   = op;
-  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-  let AsmMatchConverter = "CvtLdWriteBackRegAddrMode3";
-}
-
-class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops,
-                IndexMode im, Format f, InstrItinClass itin, string opc,
-                string asm, string cstr, list<dag> pattern>
-  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
-               pattern> {
-  // AM3 store w/ two operands: (GPR, am3offset)
-  bits<14> offset;
+  bits<4> addr;
   bits<4> Rt;
-  bits<4> Rn;
   let Inst{27-25} = 0b000;
-  let Inst{23}    = offset{8};
-  let Inst{22}    = offset{9};
-  let Inst{19-16} = Rn;
+  let Inst{24}    = 0;            // P bit
+  let Inst{21}    = 1;
+  let Inst{20}    = isLoad;       // L bit
+  let Inst{19-16} = addr;         // Rn
   let Inst{15-12} = Rt;           // Rt
-  let Inst{11-8}  = offset{7-4};  // imm7_4/zero
   let Inst{7-4}   = op;
-  let Inst{3-0}   = offset{3-0};  // imm3_0/Rm
 }
 
 // stores
@@ -648,75 +665,7 @@ class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{7-4}   = op;
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-}
-
-// Pre-indexed stores
-class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, 4, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, 4, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-
-// Post-indexed stores
-class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, 4, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  // {13}     1 == imm8, 0 == Rm
-  // {12-9}   Rn
-  // {8}      isAdd
-  // {7-4}    imm7_4/zero
-  // {3-0}    imm3_0/Rm
-  bits<14> addr;
-  bits<4> Rt;
-  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
-  let Inst{15-12} = Rt;           // Rt
-  let Inst{19-16} = addr{12-9};   // Rn
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
-  let Inst{23}    = addr{8};      // U bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, 4, IndexModePost, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
+  let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 
 // addrmode4 instructions
@@ -843,6 +792,23 @@ class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
 }
 
 // PKH instructions
+def PKHLSLAsmOperand : AsmOperandClass {
+  let Name = "PKHLSLImm";
+  let ParserMethod = "parsePKHLSLImm";
+}
+def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{
+  let PrintMethod = "printPKHLSLShiftImm";
+  let ParserMatchClass = PKHLSLAsmOperand;
+}
+def PKHASRAsmOperand : AsmOperandClass {
+  let Name = "PKHASRImm";
+  let ParserMethod = "parsePKHASRImm";
+}
+def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{
+  let PrintMethod = "printPKHASRShiftImm";
+  let ParserMatchClass = PKHASRAsmOperand;
+}
+
 class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
@@ -850,11 +816,11 @@ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
   bits<4> Rd;
   bits<4> Rn;
   bits<4> Rm;
-  bits<8> sh;
+  bits<5> sh;
   let Inst{27-20} = opcod;
   let Inst{19-16} = Rn;
   let Inst{15-12} = Rd;
-  let Inst{11-7}  = sh{7-3};
+  let Inst{11-7}  = sh;
   let Inst{6}     = tb;
   let Inst{5-4}   = 0b01;
   let Inst{3-0}   = Rm;
@@ -949,7 +915,9 @@ class Thumb1sI<dag oops, dag iops, AddrMode am, int sz,
   let InOperandList = !con(iops, (ins pred:$p));
   let AsmString = !strconcat(opc, "${s}${p}", asm);
   let Pattern = pattern;
+  let thumbArithFlagSetting = 1;
   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "ThumbSBit";
 }
 
 class T1sI<dag oops, dag iops, InstrItinClass itin,
@@ -1071,6 +1039,7 @@ class Thumb2I<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
 }
 
 // Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
@@ -1091,6 +1060,7 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = !strconcat(opc, "${s}${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
 }
 
 // Special cases
@@ -1103,6 +1073,7 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = asm;
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
 }
 
 class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
@@ -1114,6 +1085,7 @@ class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = asm;
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "Thumb";
 }
 
 class T2I<dag oops, dag iops, InstrItinClass itin,
@@ -1132,8 +1104,8 @@ class T2Ipc<dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
 class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
-              string opc, string asm, list<dag> pattern>
-  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, "",
+              string opc, string asm, string cstr, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
             pattern> {
   bits<4> Rt;
   bits<4> Rt2;
@@ -1149,6 +1121,26 @@ class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
   let Inst{11-8}  = Rt2{3-0};
   let Inst{7-0}   = addr{7-0};
 }
+class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm, string cstr,
+                  list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  bits<9> imm;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = imm{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = imm{7-0};
+}
 
 class T2sI<dag oops, dag iops, InstrItinClass itin,
            string opc, string asm, list<dag> pattern>
@@ -1172,8 +1164,8 @@ class T2XIt<dag oops, dag iops, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
 
-// T2Iidxldst - Thumb2 indexed load / store instructions.
-class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre,
                  dag oops, dag iops,
                  AddrMode am, IndexMode im, InstrItinClass itin,
                  string opc, string asm, string cstr, list<dag> pattern>
@@ -1183,25 +1175,60 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
   let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<13> addr;
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
   let Inst{24}    = signed;
   let Inst{23}    = 0;
   let Inst{22-21} = opcod;
   let Inst{20}    = load;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
   let Inst{11}    = 1;
   // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
   let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = addr{8}; // Sign bit
   let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = addr{7-0};
 
-  bits<9> addr;
-  let Inst{7-0} = addr{7-0};
-  let Inst{9}   = addr{8}; // Sign bit
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
 
   bits<4> Rt;
   bits<4> Rn;
+  bits<9> offset;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = Rn;
   let Inst{15-12} = Rt{3-0};
-  let Inst{19-16} = Rn{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = offset{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = offset{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
 }
 
 // Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
@@ -1242,6 +1269,7 @@ class VFPI<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
   list<Predicate> Predicates = [HasVFP2];
 }
 
@@ -1257,6 +1285,7 @@ class VFPXI<dag oops, dag iops, AddrMode am, int sz,
   let AsmString = asm;
   let Pattern = pattern;
   let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
   list<Predicate> Predicates = [HasVFP2];
 }
 
@@ -1574,6 +1603,7 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
 }
 
 // Same as NeonI except it does not have a "data type" specifier.
@@ -1586,6 +1616,7 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   let AsmString = !strconcat(opc, "${p}", "\t", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
 }
 
 class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
@@ -1600,6 +1631,7 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
   let Inst{7-4}   = op7_4;
 
   let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+  let DecoderNamespace = "NEONLoadStore";
 
   bits<5> Vd;
   bits<6> Rn;
@@ -1643,6 +1675,7 @@ class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
           pattern> {
   let Inst{31-25} = 0b1111001;
   let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
 }
 
 class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1651,6 +1684,7 @@ class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
            cstr, pattern> {
   let Inst{31-25} = 0b1111001;
   let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
 }
 
 // NEON "one register and a modified immediate" format.
@@ -1677,6 +1711,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
   let Inst{24}    = SIMM{7};
   let Inst{18-16} = SIMM{6-4};
   let Inst{3-0}   = SIMM{3-0};
+  let DecoderMethod = "DecodeNEONModImmInstruction";
 }
 
 // NEON 2 vector register format.
@@ -1874,6 +1909,7 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
   list<Predicate> Predicates = [HasNEON];
 
   let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+  let DecoderNamespace = "NEONDup";
 
   bits<5> V;
   bits<4> R;
@@ -1915,7 +1951,6 @@ class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
 
   bits<5> Vd;
   bits<5> Vm;
-  bits<4> lane;
 
   let Inst{22}     = Vd{4};
   let Inst{15-12} = Vd{3-0};
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index adcbf1806fe3d..48da03f63bb93 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -13,8 +13,8 @@
 
 #include "ARMInstrInfo.h"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -30,14 +30,18 @@ ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
 unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
   switch (Opc) {
   default: break;
-  case ARM::LDR_PRE:
-  case ARM::LDR_POST:
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
     return ARM::LDRi12;
   case ARM::LDRH_PRE:
   case ARM::LDRH_POST:
     return ARM::LDRH;
-  case ARM::LDRB_PRE:
-  case ARM::LDRB_POST:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
     return ARM::LDRBi12;
   case ARM::LDRSH_PRE:
   case ARM::LDRSH_POST:
@@ -45,14 +49,18 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
   case ARM::LDRSB_PRE:
   case ARM::LDRSB_POST:
     return ARM::LDRSB;
-  case ARM::STR_PRE:
-  case ARM::STR_POST:
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
     return ARM::STRi12;
   case ARM::STRH_PRE:
   case ARM::STRH_POST:
     return ARM::STRH;
-  case ARM::STRB_PRE:
-  case ARM::STRB_POST:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
     return ARM::STRBi12;
   }
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index a42dd1a54ec72..2cf0f09ffc64e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -70,6 +70,18 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
 
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperDYN    : SDNode<"ARMISD::WrapperDYN",  SDTIntUnaryOp>;
@@ -120,6 +132,12 @@ def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
 def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
 def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
 
+def ARMaddc          : SDNode<"ARMISD::ADDC",  SDTBinaryArithWithFlags,
+                              [SDNPCommutative]>;
+def ARMsubc          : SDNode<"ARMISD::SUBC",  SDTBinaryArithWithFlags>;
+def ARMadde          : SDNode<"ARMISD::ADDE",  SDTBinaryArithWithFlagsInOut>;
+def ARMsube          : SDNode<"ARMISD::SUBE",  SDTBinaryArithWithFlagsInOut>;
+
 def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
 def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
                                SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
@@ -187,10 +205,16 @@ def IsThumb          : Predicate<"Subtarget->isThumb()">,
 def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
 def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
                                  AssemblerPredicate<"ModeThumb,FeatureThumb2">;
+def IsMClass         : Predicate<"Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass">;
+def IsARClass        : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"!FeatureMClass">;
 def IsARM            : Predicate<"!Subtarget->isThumb()">,
                                  AssemblerPredicate<"!ModeThumb">;
 def IsDarwin         : Predicate<"Subtarget->isTargetDarwin()">;
 def IsNotDarwin      : Predicate<"!Subtarget->isTargetDarwin()">;
+def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">,
+                                 AssemblerPredicate<"ModeNaCl">;
 
 // FIXME: Eventually this will be just "hasV6T2Ops".
 def UseMovt          : Predicate<"Subtarget->useMovt()">;
@@ -263,24 +287,11 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_65535AsmOperand;
 }
 
+class BinOpWithFlagFrag<dag res> :
+      PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
 class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
 class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
 
-/// adde and sube predicates - True based on whether the carry flag output
-/// will be needed or not.
-def adde_dead_carry :
-  PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS),
-  [{return !N->hasAnyUseOfValue(1);}]>;
-def sube_dead_carry :
-  PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS),
-  [{return !N->hasAnyUseOfValue(1);}]>;
-def adde_live_carry :
-  PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS),
-  [{return N->hasAnyUseOfValue(1);}]>;
-def sube_live_carry :
-  PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS),
-  [{return N->hasAnyUseOfValue(1);}]>;
-
 // An 'and' node with a single use.
 def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
   return N->hasOneUse();
@@ -315,6 +326,7 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
 def brtarget : Operand<OtherVT> {
   let EncoderMethod = "getBranchTargetOpValue";
   let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeT2BROperand";
 }
 
 // FIXME: get rid of this one?
@@ -345,39 +357,35 @@ def bl_target : Operand<i32> {
   let OperandType = "OPERAND_PCREL";
 }
 
-
-// A list of registers separated by comma. Used by load/store multiple.
-def RegListAsmOperand : AsmOperandClass {
-  let Name = "RegList";
-  let SuperClasses = [];
-}
-
-def DPRRegListAsmOperand : AsmOperandClass {
-  let Name = "DPRRegList";
-  let SuperClasses = [];
-}
-
-def SPRRegListAsmOperand : AsmOperandClass {
-  let Name = "SPRRegList";
-  let SuperClasses = [];
+def blx_target : Operand<i32> {
+  // Encoded the same as branch targets.
+  let EncoderMethod = "getARMBLXTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
 }
 
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
 def reglist : Operand<i32> {
   let EncoderMethod = "getRegisterListOpValue";
   let ParserMatchClass = RegListAsmOperand;
   let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
 }
 
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
 def dpr_reglist : Operand<i32> {
   let EncoderMethod = "getRegisterListOpValue";
   let ParserMatchClass = DPRRegListAsmOperand;
   let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeDPRRegListOperand";
 }
 
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
 def spr_reglist : Operand<i32> {
   let EncoderMethod = "getRegisterListOpValue";
   let ParserMatchClass = SPRRegListAsmOperand;
   let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeSPRRegListOperand";
 }
 
 // An operand for the CONSTPOOL_ENTRY pseudo-instruction.
@@ -397,56 +405,99 @@ def adrlabel : Operand<i32> {
 
 def neon_vcvt_imm32 : Operand<i32> {
   let EncoderMethod = "getNEONVcvtImm32OpValue";
+  let DecoderMethod = "DecodeVCVTImmOperand";
 }
 
 // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
-def rot_imm : Operand<i32>, ImmLeaf<i32, [{
-    int32_t v = (int32_t)Imm;
-    return v == 8 || v == 16 || v == 24; }]> {
-  let EncoderMethod = "getRotImmOpValue";
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+  switch (N->getZExtValue()){
+  default: assert(0);
+  case 0:  return CurDAG->getTargetConstant(0, MVT::i32);
+  case 8:  return CurDAG->getTargetConstant(1, MVT::i32);
+  case 16: return CurDAG->getTargetConstant(2, MVT::i32);
+  case 24: return CurDAG->getTargetConstant(3, MVT::i32);
+  }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+  let Name = "RotImm";
+  let ParserMethod = "parseRotImm";
 }
-
-def ShifterAsmOperand : AsmOperandClass {
-  let Name = "Shifter";
-  let SuperClasses = [];
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    int32_t v = N->getZExtValue();
+    return v == 8 || v == 16 || v == 24; }],
+    rot_imm_XFORM> {
+  let PrintMethod = "printRotImmOperand";
+  let ParserMatchClass = RotImmAsmOperand;
 }
 
 // shift_imm: An integer that encodes a shift amount and the type of shift
-// (currently either asr or lsl) using the same encoding used for the
-// immediates in so_reg operands.
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+  let Name = "ShifterImm";
+  let ParserMethod = "parseShifterImm";
+}
 def shift_imm : Operand<i32> {
   let PrintMethod = "printShiftImmOperand";
-  let ParserMatchClass = ShifterAsmOperand;
+  let ParserMatchClass = ShifterImmAsmOperand;
 }
 
-def ShiftedRegAsmOperand : AsmOperandClass {
-  let Name = "ShiftedReg";
+// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>,  // reg reg imm
+                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
 }
 
-// shifter_operand operands: so_reg and so_imm.
-def so_reg : Operand<i32>,    // reg reg imm
-             ComplexPattern<i32, 3, "SelectShifterOperandReg",
-                            [shl,srl,sra,rotr]> {
-  let EncoderMethod = "getSORegOpValue";
-  let PrintMethod = "printSORegOperand";
-  let ParserMatchClass = ShiftedRegAsmOperand;
-  let MIOperandInfo = (ops GPR, GPR, shift_imm);
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
 }
+
 // FIXME: Does this need to be distinct from so_reg?
-def shift_so_reg : Operand<i32>,    // reg reg imm
-                   ComplexPattern<i32, 3, "SelectShiftShifterOperandReg",
+def shift_so_reg_imm : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
                                   [shl,srl,sra,rotr]> {
-  let EncoderMethod = "getSORegOpValue";
-  let PrintMethod = "printSORegOperand";
-  let MIOperandInfo = (ops GPR, GPR, shift_imm);
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
 }
 
+
 // so_imm - Match a 32-bit shifter_operand immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits.
+def SOImmAsmOperand: AsmOperandClass { let Name = "ARMSOImm"; }
 def so_imm : Operand<i32>, ImmLeaf<i32, [{
     return ARM_AM::getSOImmVal(Imm) != -1;
   }]> {
   let EncoderMethod = "getSOImmOpValue";
+  let ParserMatchClass = SOImmAsmOperand;
+  let DecoderMethod = "DecodeSOImmOperand";
 }
 
 // Break so_imm's up into two pieces.  This handles immediates with up to 16
@@ -464,7 +515,7 @@ def arm_i32imm : PatLeaf<(imm), [{
   return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
 }]>;
 
-/// imm0_7 predicate - Immediate in the range [0,31].
+/// imm0_7 predicate - Immediate in the range [0,7].
 def Imm0_7AsmOperand: AsmOperandClass { let Name = "Imm0_7"; }
 def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 8;
@@ -472,7 +523,7 @@ def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_7AsmOperand;
 }
 
-/// imm0_15 predicate - Immediate in the range [0,31].
+/// imm0_15 predicate - Immediate in the range [0,15].
 def Imm0_15AsmOperand: AsmOperandClass { let Name = "Imm0_15"; }
 def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 16;
@@ -481,68 +532,83 @@ def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
 }
 
 /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: AsmOperandClass { let Name = "Imm0_31"; }
 def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 32;
-}]>;
-
-/// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
-def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{
-  return Imm >= 0 && Imm < 32;
 }]> {
-  let EncoderMethod = "getImmMinusOneOpValue";
+  let ParserMatchClass = Imm0_31AsmOperand;
 }
 
-// i32imm_hilo16 - For movt/movw - sets the MC Encoder method.
-// The imm is split into imm{15-12}, imm{11-0}
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+  let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
 //
-def i32imm_hilo16 : Operand<i32> {
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: AsmOperandClass { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
   let EncoderMethod = "getHiLo16ImmOpValue";
+  let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: AsmOperandClass { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+  let ParserMatchClass = Imm24bitAsmOperand;
 }
 
+
 /// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
 /// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+  let Name = "Bitfield";
+  let ParserMethod = "parseBitfield";
+}
 def bf_inv_mask_imm : Operand<i32>,
                       PatLeaf<(imm), [{
   return ARM::isBitFieldInvertedMask(N->getZExtValue());
 }] > {
   let EncoderMethod = "getBitfieldInvertedMaskOpValue";
   let PrintMethod = "printBitfieldInvMaskImmOperand";
+  let DecoderMethod = "DecodeBitfieldMaskOperand";
+  let ParserMatchClass = BitfieldAsmOperand;
 }
 
-/// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
-def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{
-  return isInt<5>(Imm);
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
 }]>;
-
-/// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
-def width_imm : Operand<i32>, ImmLeaf<i32, [{
-  return Imm > 0 &&  Imm <= 32;
-}] > {
-  let EncoderMethod = "getMsbOpValue";
-}
-
-def ssat_imm : Operand<i32>, ImmLeaf<i32, [{
-  return Imm > 0 && Imm <= 32;
-}]> {
-  let EncoderMethod = "getSsatBitPosValue";
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+   uint64_t Imm = N->getZExtValue();
+   return Imm > 0 && Imm <= 32;
+ }],
+    imm1_32_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+    imm1_16_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_16AsmOperand;
 }
 
 // Define ARM specific addressing modes.
-
-def MemMode2AsmOperand : AsmOperandClass {
-  let Name = "MemMode2";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseMemMode2Operand";
-}
-
-def MemMode3AsmOperand : AsmOperandClass {
-  let Name = "MemMode3";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseMemMode3Operand";
-}
-
 // addrmode_imm12 := reg +/- imm12
 //
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
 def addrmode_imm12 : Operand<i32>,
                      ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
   // 12-bit immediate operand. Note that instructions using this encode
@@ -551,53 +617,129 @@ def addrmode_imm12 : Operand<i32>,
 
   let EncoderMethod = "getAddrModeImm12OpValue";
   let PrintMethod = "printAddrModeImm12Operand";
+  let DecoderMethod = "DecodeAddrModeImm12Operand";
+  let ParserMatchClass = MemImm12OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 // ldst_so_reg := reg +/- reg shop imm
 //
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
 def ldst_so_reg : Operand<i32>,
                   ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
   let EncoderMethod = "getLdStSORegOpValue";
   // FIXME: Simplify the printer
   let PrintMethod = "printAddrMode2Operand";
-  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+  let DecoderMethod = "DecodeSORegMemOperand";
+  let ParserMatchClass = MemRegOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
 }
 
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8Operand";
+  let ParserMatchClass = PostIdxImm8AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8s4Operand";
+  let ParserMatchClass = PostIdxImm8s4AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+  let Name = "PostIdxReg";
+  let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : Operand<i32> {
+  let EncoderMethod = "getPostIdxRegOpValue";
+  let DecoderMethod = "DecodePostIdxReg";
+  let PrintMethod = "printPostIdxRegOperand";
+  let ParserMatchClass = PostIdxRegAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+
 // addrmode2 := reg +/- imm12
 //           := reg +/- reg shop imm
 //
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
 def addrmode2 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode2", []> {
   let EncoderMethod = "getAddrMode2OpValue";
   let PrintMethod = "printAddrMode2Operand";
-  let ParserMatchClass = MemMode2AsmOperand;
+  let ParserMatchClass = AddrMode2AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
-def am2offset : Operand<i32>,
-                ComplexPattern<i32, 2, "SelectAddrMode2Offset",
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+  let Name = "PostIdxRegShifted";
+  let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  // When using this for assembly, it's always as a post-index offset.
+  let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigal at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
                 [], [SDNPWantRoot]> {
   let EncoderMethod = "getAddrMode2OffsetOpValue";
   let PrintMethod = "printAddrMode2OffsetOperand";
+  let ParserMatchClass = AM2OffsetImmAsmOperand;
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
+
 // addrmode3 := reg +/- reg
 // addrmode3 := reg +/- imm8
 //
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
 def addrmode3 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode3", []> {
   let EncoderMethod = "getAddrMode3OpValue";
   let PrintMethod = "printAddrMode3Operand";
-  let ParserMatchClass = MemMode3AsmOperand;
+  let ParserMatchClass = AddrMode3AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+  let Name = "AM3Offset";
+  let ParserMethod = "parseAM3Offset";
+}
 def am3offset : Operand<i32>,
                 ComplexPattern<i32, 2, "SelectAddrMode3Offset",
                                [], [SDNPWantRoot]> {
   let EncoderMethod = "getAddrMode3OffsetOpValue";
   let PrintMethod = "printAddrMode3OffsetOperand";
+  let ParserMatchClass = AM3OffsetAsmOperand;
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
@@ -608,28 +750,28 @@ def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
   let PrintMethod = "printLdStmModeOperand";
 }
 
-def MemMode5AsmOperand : AsmOperandClass {
-  let Name = "MemMode5";
-  let SuperClasses = [];
-}
-
 // addrmode5 := reg +/- imm8*4
 //
+def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
 def addrmode5 : Operand<i32>,
                 ComplexPattern<i32, 2, "SelectAddrMode5", []> {
   let PrintMethod = "printAddrMode5Operand";
-  let MIOperandInfo = (ops GPR:$base, i32imm);
-  let ParserMatchClass = MemMode5AsmOperand;
   let EncoderMethod = "getAddrMode5OpValue";
+  let DecoderMethod = "DecodeAddrMode5Operand";
+  let ParserMatchClass = AddrMode5AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm);
 }
 
 // addrmode6 := reg with optional alignment
 //
+def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
 def addrmode6 : Operand<i32>,
                 ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
   let PrintMethod = "printAddrMode6Operand";
-  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
   let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+  let ParserMatchClass = AddrMode6AsmOperand;
 }
 
 def am6offset : Operand<i32>,
@@ -638,6 +780,7 @@ def am6offset : Operand<i32>,
   let PrintMethod = "printAddrMode6OffsetOperand";
   let MIOperandInfo = (ops GPR);
   let EncoderMethod = "getAddrMode6OffsetOpValue";
+  let DecoderMethod = "DecodeGPRRegisterClass";
 }
 
 // Special version of addrmode6 to handle alignment encoding for VST1/VLD1
@@ -666,19 +809,15 @@ def addrmodepc : Operand<i32>,
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
-def MemMode7AsmOperand : AsmOperandClass {
-  let Name = "MemMode7";
-  let SuperClasses = [];
-}
-
-// addrmode7 := reg
-// Used by load/store exclusive instructions. Useful to enable right assembly
-// parsing and printing. Not used for any codegen matching.
+// addr_offset_none := reg
 //
-def addrmode7 : Operand<i32> {
+def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
+def addr_offset_none : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
   let PrintMethod = "printAddrMode7Operand";
-  let MIOperandInfo = (ops GPR);
-  let ParserMatchClass = MemMode7AsmOperand;
+  let DecoderMethod = "DecodeAddrMode7Operand";
+  let ParserMatchClass = MemNoOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base);
 }
 
 def nohash_imm : Operand<i32> {
@@ -687,25 +826,30 @@ def nohash_imm : Operand<i32> {
 
 def CoprocNumAsmOperand : AsmOperandClass {
   let Name = "CoprocNum";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseCoprocNumOperand";
-}
-
-def CoprocRegAsmOperand : AsmOperandClass {
-  let Name = "CoprocReg";
-  let SuperClasses = [];
-  let ParserMethod = "tryParseCoprocRegOperand";
+  let ParserMethod = "parseCoprocNumOperand";
 }
-
 def p_imm : Operand<i32> {
   let PrintMethod = "printPImmediate";
   let ParserMatchClass = CoprocNumAsmOperand;
+  let DecoderMethod = "DecodeCoprocessor";
 }
 
+def CoprocRegAsmOperand : AsmOperandClass {
+  let Name = "CoprocReg";
+  let ParserMethod = "parseCoprocRegOperand";
+}
 def c_imm : Operand<i32> {
   let PrintMethod = "printCImmediate";
   let ParserMatchClass = CoprocRegAsmOperand;
 }
+def CoprocOptionAsmOperand : AsmOperandClass {
+  let Name = "CoprocOption";
+  let ParserMethod = "parseCoprocOptionOperand";
+}
+def coproc_option_imm : Operand<i32> {
+  let PrintMethod = "printCoprocOptionImm";
+  let ParserMatchClass = CoprocOptionAsmOperand;
+}
 
 //===----------------------------------------------------------------------===//
 
@@ -748,16 +892,37 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
     let Inst{11-4} = 0b00000000;
     let Inst{3-0} = Rm;
   }
-  def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
                iis, opc, "\t$Rd, $Rn, $shift",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<12> shift;
     let Inst{25} = 0;
     let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
-    let Inst{11-0} = shift;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
   }
 
   // Assembly aliases for optional destination operand when it's the same
@@ -773,59 +938,175 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
                                                     cc_out:$s)>,
      Requires<[IsARM]>;
   def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-     (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPR:$Rdn, GPR:$Rdn,
-                                                    so_reg:$shift, pred:$p,
+     (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_imm:$shift, pred:$p,
                                                     cc_out:$s)>,
      Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_reg:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+
 }
 
-/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
-/// instruction modifies the CPSR register.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
-multiclass AI1_bin_s_irs<bits<4> opcod, string opc,
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
-                         PatFrag opnode, bit Commutable = 0> {
-  def ri : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+                        PatFrag opnode, string baseOpc, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
                iii, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> {
+               [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<12> imm;
     let Inst{25} = 1;
-    let Inst{20} = 1;
     let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
     let Inst{11-0} = imm;
   }
-  def rr : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
                iir, opc, "\t$Rd, $Rn, $Rm",
-               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
+               [/* pattern left blank */]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<4> Rm;
-    let isCommutable = Commutable;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
     let Inst{25} = 0;
-    let Inst{20} = 1;
     let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
-    let Inst{11-4} = 0b00000000;
-    let Inst{3-0} = Rm;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
   }
-  def rs : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
                iis, opc, "\t$Rd, $Rn, $shift",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+               [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  // Assembly aliases for optional destination operand when it's the same
+  // as the source operand.
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_imm:$imm, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
+                                                    GPR:$Rm, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_imm:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_reg:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_rbin_s_is except it sets 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>;
+
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>;
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn))]>;
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn))]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<12> shift;
     let Inst{25} = 0;
-    let Inst{20} = 1;
     let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
-    let Inst{11-0} = shift;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
   }
 }
 }
 
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
+multiclass AsI1_bin_s_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>;
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>;
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift))]>;
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift))]>;
+}
+}
+
 /// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
 /// patterns. Similar to AsI1_bin_irs except the instruction does not produce
 /// a explicit result, only implicitly set CPSR.
@@ -857,128 +1138,190 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc,
     let Inst{11-4} = 0b00000000;
     let Inst{3-0} = Rm;
   }
-  def rs : AI1<opcod, (outs), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, iis,
+  def rsi : AI1<opcod, (outs),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
                opc, "\t$Rn, $shift",
-               [(opnode GPR:$Rn, so_reg:$shift)]> {
+               [(opnode GPR:$Rn, so_reg_imm:$shift)]> {
     bits<4> Rn;
     bits<12> shift;
     let Inst{25} = 0;
     let Inst{20} = 1;
     let Inst{19-16} = Rn;
     let Inst{15-12} = 0b0000;
-    let Inst{11-0} = shift;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
   }
+  def rsr : AI1<opcod, (outs),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPR:$Rn, so_reg_reg:$shift)]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+
 }
 }
 
 /// AI_ext_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
 /// FIXME: Remove the 'r' variant. Its rot_imm is zero.
-multiclass AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> {
-  def r     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
-                 IIC_iEXTr, opc, "\t$Rd, $Rm",
-                 [(set GPR:$Rd, (opnode GPR:$Rm))]>,
-              Requires<[IsARM, HasV6]> {
-    bits<4> Rd;
-    bits<4> Rm;
-    let Inst{19-16} = 0b1111;
-    let Inst{15-12} = Rd;
-    let Inst{11-10} = 0b00;
-    let Inst{3-0}   = Rm;
-  }
-  def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
-                 IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
-                 [(set GPR:$Rd, (opnode (rotr GPR:$Rm, rot_imm:$rot)))]>,
-              Requires<[IsARM, HasV6]> {
-    bits<4> Rd;
-    bits<4> Rm;
-    bits<2> rot;
-    let Inst{19-16} = 0b1111;
-    let Inst{15-12} = Rd;
-    let Inst{11-10} = rot;
-    let Inst{3-0}   = Rm;
-  }
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+       Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{3-0}   = Rm;
 }
 
-multiclass AI_ext_rrot_np<bits<8> opcod, string opc> {
-  def r     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
-                 IIC_iEXTr, opc, "\t$Rd, $Rm",
-                 [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV6]> {
-    let Inst{19-16} = 0b1111;
-    let Inst{11-10} = 0b00;
-  }
-  def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
-                 IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
-                 [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV6]> {
-    bits<2> rot;
-    let Inst{19-16} = 0b1111;
-    let Inst{11-10} = rot;
-  }
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]> {
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{11-10} = rot;
 }
 
 /// AI_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> {
-  def rr     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
-                  [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
-               Requires<[IsARM, HasV6]> {
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+                                     (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+        Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{9-4}   = 0b000111;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             string baseOpc, bit Commutable = 0> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>,
+               Requires<[IsARM]> {
     bits<4> Rd;
-    bits<4> Rm;
     bits<4> Rn;
-    let Inst{19-16} = Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
     let Inst{15-12} = Rd;
-    let Inst{11-10} = 0b00;
-    let Inst{9-4}   = 0b000111;
-    let Inst{3-0}   = Rm;
-  }
-  def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
-                                             rot_imm:$rot),
-                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
-                  [(set GPR:$Rd, (opnode GPR:$Rn,
-                                          (rotr GPR:$Rm, rot_imm:$rot)))]>,
-                  Requires<[IsARM, HasV6]> {
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+               Requires<[IsARM]> {
     bits<4> Rd;
+    bits<4> Rn;
     bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+                (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+               Requires<[IsARM]> {
+    bits<4> Rd;
     bits<4> Rn;
-    bits<2> rot;
+    bits<12> shift;
+    let Inst{25} = 0;
     let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
-    let Inst{11-10} = rot;
-    let Inst{9-4}   = 0b000111;
-    let Inst{3-0}   = Rm;
-  }
-}
-
-// For disassembly only.
-multiclass AI_exta_rrot_np<bits<8> opcod, string opc> {
-  def rr     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
-                  [/* For disassembly only; pattern left blank */]>,
-               Requires<[IsARM, HasV6]> {
-    let Inst{11-10} = 0b00;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
   }
-  def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
-                                             rot_imm:$rot),
-                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
-                  [/* For disassembly only; pattern left blank */]>,
-                  Requires<[IsARM, HasV6]> {
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+                (ins GPR:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift, CPSR))]>,
+               Requires<[IsARM]> {
+    bits<4> Rd;
     bits<4> Rn;
-    bits<2> rot;
+    bits<12> shift;
+    let Inst{25} = 0;
     let Inst{19-16} = Rn;
-    let Inst{11-10} = rot;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
   }
+
+  // Assembly aliases for optional destination operand when it's the same
+  // as the source operand.
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_imm:$imm, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
+                                                    GPR:$Rm, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_imm:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_reg:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
 }
 
-/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
-multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
-                             string baseOpc, bit Commutable = 0> {
-  let Uses = [CPSR] in {
+/// AI1_rsc_irs - Define instructions and patterns for rsc
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode,
+                       string baseOpc> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
   def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
                 DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
+               [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>,
                Requires<[IsARM]> {
     bits<4> Rd;
     bits<4> Rn;
@@ -990,31 +1333,48 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
   }
   def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                 DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
-               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
-               Requires<[IsARM]> {
+               [/* pattern left blank */]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<4> Rm;
     let Inst{11-4} = 0b00000000;
     let Inst{25} = 0;
-    let isCommutable = Commutable;
     let Inst{3-0} = Rm;
     let Inst{15-12} = Rd;
     let Inst{19-16} = Rn;
   }
-  def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                DPSoRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>,
+  def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
                Requires<[IsARM]> {
     bits<4> Rd;
     bits<4> Rn;
     bits<12> shift;
     let Inst{25} = 0;
-    let Inst{11-0} = shift;
+    let Inst{19-16} = Rn;
     let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
     let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
   }
   }
+
   // Assembly aliases for optional destination operand when it's the same
   // as the source operand.
   def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
@@ -1028,28 +1388,15 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                                                     cc_out:$s)>,
      Requires<[IsARM]>;
   def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-     (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPR:$Rdn, GPR:$Rdn,
-                                                    so_reg:$shift, pred:$p,
+     (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_imm:$shift, pred:$p,
+                                                    cc_out:$s)>,
+     Requires<[IsARM]>;
+  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+                                                    so_reg_reg:$shift, pred:$p,
                                                     cc_out:$s)>,
      Requires<[IsARM]>;
-}
-
-// Carry setting variants
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
-  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
-               4, IIC_iALUi,
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>;
-  def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-               4, IIC_iALUr,
-               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
-    let isCommutable = Commutable;
-  }
-  def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-               4, IIC_iALUsr,
-               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>;
-}
 }
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
@@ -1082,6 +1429,37 @@ multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
 }
 }
 
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                  [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift),
+                  AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                 [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+
 multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
            InstrItinClass iir, PatFrag opnode> {
   // Note: We use the complex addrmode_imm12 rather than just an input
@@ -1110,6 +1488,37 @@ multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
     let Inst{11-0}  = shift{11-0};
   }
 }
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+                  AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                 [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -1140,42 +1549,66 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
            [(ARMcallseq_start timm:$amt)]>;
 }
 
-def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "",
-             [/* For disassembly only; pattern left blank */]>,
+// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
+// (These psuedos use a hand-written selection code).
+let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
+def ATOMOR6432   : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMXOR6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMADD6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMSUB6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMAND6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                                 (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+                                      GPR:$set1, GPR:$set2),
+                                 NoItinerary, []>;
+}
+
+def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
   let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000000;
 }
 
-def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "",
-             [/* For disassembly only; pattern left blank */]>,
+def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
   let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000001;
 }
 
-def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "",
-             [/* For disassembly only; pattern left blank */]>,
+def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
   let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000010;
 }
 
-def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "",
-             [/* For disassembly only; pattern left blank */]>,
+def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
   let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000011;
 }
 
-def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel",
-             "\t$dst, $a, $b",
-             [/* For disassembly only; pattern left blank */]>,
-          Requires<[IsARM, HasV6]> {
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+             "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
   bits<4> Rd;
   bits<4> Rn;
   bits<4> Rm;
@@ -1188,8 +1621,7 @@ def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel",
 }
 
 def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "",
-             [/* For disassembly only; pattern left blank */]>,
-          Requires<[IsARM, HasV6T2]> {
+             []>, Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
   let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000100;
@@ -1206,14 +1638,11 @@ def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
   let Inst{7-4} = 0b0111;
 }
 
-// Change Processor State is a system instruction -- for disassembly and
-// parsing only.
-// FIXME: Since the asm parser has currently no clean way to handle optional
-// operands, create 3 versions of the same instruction. Once there's a clean
-// framework to represent optional operands, change this behavior.
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
 class CPS<dag iops, string asm_ops>
   : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
-        [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> {
+        []>, Requires<[IsARM]> {
   bits<2> imod;
   bits<3> iflags;
   bits<5> mode;
@@ -1229,17 +1658,18 @@ class CPS<dag iops, string asm_ops>
   let Inst{4-0}   = mode;
 }
 
+let DecoderMethod = "DecodeCPSInstruction" in {
 let M = 1 in
-  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
                   "$imod\t$iflags, $mode">;
 let mode = 0, M = 0 in
   def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
 
 let imod = 0, iflags = 0, M = 1 in
-  def CPS1p : CPS<(ins i32imm:$mode), "\t$mode">;
+  def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
 
 // Preload signals the memory system of possible future data/instruction access.
-// These are for disassembly only.
 multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
 
   def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
@@ -1271,6 +1701,7 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
     let Inst{19-16} = shift{16-13}; // Rn
     let Inst{15-12} = 0b1111;
     let Inst{11-0}  = shift{11-0};
+    let Inst{4} = 0;
   }
 }
 
@@ -1278,10 +1709,8 @@ defm PLD  : APreLoad<1, 1, "pld">,  Requires<[IsARM]>;
 defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
 defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
 
-def SETEND : AXI<(outs),(ins setend_op:$end), MiscFrm, NoItinerary,
-                 "setend\t$end",
-                 [/* For disassembly only; pattern left blank */]>,
-               Requires<[IsARM]> {
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+                 "setend\t$end", []>, Requires<[IsARM]> {
   bits<1> end;
   let Inst{31-10} = 0b1111000100000001000000;
   let Inst{9} = end;
@@ -1351,14 +1780,17 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in
 // the instruction. The {24-21} opcode bits are set by the fixup, as we don't
 // know until then which form of the instruction will be used.
 def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
-                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> {
+                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []> {
   bits<4> Rd;
-  bits<12> label;
+  bits<14> label;
   let Inst{27-25} = 0b001;
+  let Inst{24} = 0;
+  let Inst{23-22} = label{13-12};
+  let Inst{21} = 0;
   let Inst{20} = 0;
   let Inst{19-16} = 0b1111;
   let Inst{15-12} = Rd;
-  let Inst{11-0} = label;
+  let Inst{11-0} = label{11-0};
 }
 def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
                     4, IIC_iALUi, []>;
@@ -1424,6 +1856,7 @@ let isCall = 1,
     let Inst{31-28} = 0b1110;
     bits<24> func;
     let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
   }
 
   def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops),
@@ -1432,6 +1865,7 @@ let isCall = 1,
                 Requires<[IsARM, IsNotDarwin]> {
     bits<24> func;
     let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
   }
 
   // ARMv5T and above
@@ -1516,6 +1950,7 @@ let isBranch = 1, isTerminator = 1 in {
                [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
     bits<24> target;
     let Inst{23-0} = target;
+    let DecoderMethod = "DecodeBranchImmInstruction";
   }
 
   let isBarrier = 1 in {
@@ -1549,9 +1984,9 @@ let isBranch = 1, isTerminator = 1 in {
 
 }
 
-// BLX (immediate) -- for disassembly only
-def BLXi : AXI<(outs), (ins br_target:$target), BrMiscFrm, NoItinerary,
-               "blx\t$target", [/* pattern left blank */]>,
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+               "blx\t$target", []>,
            Requires<[IsARM, HasV5T]> {
   let Inst{31-25} = 0b1111101;
   bits<25> target;
@@ -1614,64 +2049,100 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   }
 }
 
-
-
-
-
-// Secure Monitor Call is a system instruction -- for disassembly only
-def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
-              [/* For disassembly only; pattern left blank */]> {
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+              []> {
   bits<4> opt;
   let Inst{23-4} = 0b01100000000000000111;
   let Inst{3-0} = opt;
 }
 
-// Supervisor Call (Software Interrupt) -- for disassembly only
+// Supervisor Call (Software Interrupt)
 let isCall = 1, Uses = [SP] in {
-def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc",
-              [/* For disassembly only; pattern left blank */]> {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> {
   bits<24> svc;
   let Inst{23-0} = svc;
 }
 }
 
-// Store Return State is a system instruction -- for disassembly only
-let isCodeGenOnly = 1 in {  // FIXME: This should not use submode!
-def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
-                NoItinerary, "srs${amode}\tsp!, $mode",
-                [/* For disassembly only; pattern left blank */]> {
+// Store Return State
+class SRSI<bit wb, string asm>
+  : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<5> mode;
   let Inst{31-28} = 0b1111;
-  let Inst{22-20} = 0b110; // W = 1
-  let Inst{19-8} = 0xd05;
-  let Inst{7-5} = 0b000;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 1;
+  let Inst{21} = wb;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1101;  // SP
+  let Inst{15-5} = 0b00000101000;
+  let Inst{4-0} = mode;
 }
 
-def SRS  : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
-                NoItinerary, "srs${amode}\tsp, $mode",
-                [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{22-20} = 0b100; // W = 0
-  let Inst{19-8} = 0xd05;
-  let Inst{7-5} = 0b000;
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+  let Inst{24-23} = 0b11;
 }
 
-// Return From Exception is a system instruction -- for disassembly only
-def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
-                NoItinerary, "rfe${amode}\t$base!",
-                [/* For disassembly only; pattern left blank */]> {
+// Return From Exception
+class RFEI<bit wb, string asm>
+  : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<4> Rn;
   let Inst{31-28} = 0b1111;
-  let Inst{22-20} = 0b011; // W = 1
-  let Inst{15-0} = 0x0a00;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;
+  let Inst{21} = wb;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xa00;
 }
 
-def RFE  : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
-                NoItinerary, "rfe${amode}\t$base",
-                [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{22-20} = 0b001; // W = 0
-  let Inst{15-0} = 0x0a00;
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+  let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+  let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+  let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+  let Inst{24-23} = 0b11;
 }
-} // isCodeGenOnly = 1
 
 //===----------------------------------------------------------------------===//
 //  Load / store Instructions.
@@ -1682,16 +2153,16 @@ def RFE  : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
 
 defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
                     UnOpFrag<(load node:$Src)>>;
-defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
                     UnOpFrag<(zextloadi8 node:$Src)>>;
 defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
                    BinOpFrag<(store node:$LHS, node:$RHS)>>;
-defm STRB : AI_str1<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
                    BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
 
 // Special LDR for loads from non-pc-relative constpools.
 let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
-    isReMaterializable = 1 in
+    isReMaterializable = 1, isCodeGenOnly = 1 in
 def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
                  AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
                  []> {
@@ -1725,36 +2196,67 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
                  []>, Requires<[IsARM, HasV5TE]>;
 }
 
-// Indexed loads
-multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> {
-  def _PRE  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                      (ins addrmode2:$addr), IndexModePre, LdFrm, itin,
+// Indexed loads
+multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> {
+  def _PRE_IMM  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addrmode_imm12:$addr), IndexModePre, LdFrm, itin,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let DecoderMethod = "DecodeLDRPreImm";
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12";
+  }
+
+  def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins ldst_so_reg:$addr), IndexModePre, LdFrm, itin,
                       opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
-    // {17-14}  Rn
-    // {13}     1 == Rm, 0 == imm12
-    // {12}     isAdd
-    // {11-0}   imm12/Rm
-    bits<18> addr;
-    let Inst{25} = addr{13};
+    bits<17> addr;
+    let Inst{25} = 1;
     let Inst{23} = addr{12};
-    let Inst{19-16} = addr{17-14};
+    let Inst{19-16} = addr{16-13};
     let Inst{11-0} = addr{11-0};
-    let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
+    let Inst{4} = 0;
+    let DecoderMethod = "DecodeLDRPreReg";
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2";
   }
-  def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
-                      (ins GPR:$Rn, am2offset:$offset),
+
+  def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                       IndexModePost, LdFrm, itin,
+                       opc, "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_imm:$offset),
                       IndexModePost, LdFrm, itin,
-                      opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
-    // {13}     1 == Rm, 0 == imm12
+                      opc, "\t$Rt, $addr, $offset",
+                      "$addr.base = $Rn_wb", []> {
     // {12}     isAdd
     // {11-0}   imm12/Rm
     bits<14> offset;
-    bits<4> Rn;
-    let Inst{25} = offset{13};
+    bits<4> addr;
+    let Inst{25} = 0;
     let Inst{23} = offset{12};
-    let Inst{19-16} = Rn;
+    let Inst{19-16} = addr;
     let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
   }
+
 }
 
 let mayLoad = 1, neverHasSideEffects = 1 in {
@@ -1762,8 +2264,8 @@ defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_ru>;
 defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_ru>;
 }
 
-multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> {
-  def _PRE  : AI3ldstidx<op, op20, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+  def _PRE  : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                         (ins addrmode3:$addr), IndexModePre,
                         LdMiscFrm, itin,
                         opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
@@ -1773,27 +2275,31 @@ multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> {
     let Inst{19-16} = addr{12-9};   // Rn
     let Inst{11-8}  = addr{7-4};    // imm7_4/zero
     let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3";
+    let DecoderMethod = "DecodeAddrMode3Instruction";
   }
-  def _POST : AI3ldstidx<op, op20, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
-                        (ins GPR:$Rn, am3offset:$offset), IndexModePost,
-                        LdMiscFrm, itin,
-                        opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
+  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addr_offset_none:$addr, am3offset:$offset),
+                        IndexModePost, LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                        []> {
     bits<10> offset;
-    bits<4> Rn;
+    bits<4> addr;
     let Inst{23}    = offset{8};      // U bit
     let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
-    let Inst{19-16} = Rn;
+    let Inst{19-16} = addr;
     let Inst{11-8}  = offset{7-4};    // imm7_4/zero
     let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
   }
 }
 
 let mayLoad = 1, neverHasSideEffects = 1 in {
-defm LDRH  : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>;
-defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>;
-defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>;
+defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
 let hasExtraDefRegAllocReq = 1 in {
-def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                           (ins addrmode3:$addr), IndexModePre,
                           LdMiscFrm, IIC_iLoad_d_ru,
                           "ldrd", "\t$Rt, $Rt2, $addr!",
@@ -1804,70 +2310,128 @@ def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
   let Inst{19-16} = addr{12-9};   // Rn
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let AsmMatchConverter = "cvtLdrdPre";
 }
-def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
-                          (ins GPR:$Rn, am3offset:$offset), IndexModePost,
-                          LdMiscFrm, IIC_iLoad_d_ru,
-                          "ldrd", "\t$Rt, $Rt2, [$Rn], $offset",
-                          "$Rn = $Rn_wb", []> {
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addr_offset_none:$addr, am3offset:$offset),
+                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
   bits<10> offset;
-  bits<4> Rn;
+  bits<4> addr;
   let Inst{23}    = offset{8};      // U bit
   let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
-  let Inst{19-16} = Rn;
+  let Inst{19-16} = addr;
   let Inst{11-8}  = offset{7-4};    // imm7_4/zero
   let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 } // hasExtraDefRegAllocReq = 1
 } // mayLoad = 1, neverHasSideEffects = 1
 
-// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only.
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
 let mayLoad = 1, neverHasSideEffects = 1 in {
-def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$base_wb),
-                   (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_ru,
-                   "ldrt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
-  // {17-14}  Rn
-  // {13}     1 == Rm, 0 == imm12
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_ru,
+                    "ldrt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
   // {12}     isAdd
   // {11-0}   imm12/Rm
-  bits<18> addr;
-  let Inst{25} = addr{13};
-  let Inst{23} = addr{12};
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
-  let Inst{19-16} = addr{17-14};
-  let Inst{11-0} = addr{11-0};
-  let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
-}
-def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
-                  (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_bh_ru,
-                  "ldrbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
-  // {17-14}  Rn
-  // {13}     1 == Rm, 0 == imm12
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, LdFrm, IIC_iLoad_ru,
+                   "ldrt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
   // {12}     isAdd
   // {11-0}   imm12/Rm
-  bits<18> addr;
-  let Inst{25} = addr{13};
-  let Inst{23} = addr{12};
-  let Inst{21} = 1; // overwrite
-  let Inst{19-16} = addr{17-14};
-  let Inst{11-0} = addr{11-0};
-  let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
-}
-def LDRSBT : AI3ldstidxT<0b1101, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
-             (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
-             "ldrsbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
 }
-def LDRHT  : AI3ldstidxT<0b1011, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
-             (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
-             "ldrht", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+
+def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                     IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                     "ldrbt", "\t$Rt, $addr, $offset",
+                     "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
-}
-def LDRSHT : AI3ldstidxT<0b1111, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
-             (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
-             "ldrsht", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_IMM : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                    "ldrbt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+multiclass AI3ldrT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_imm8:$offset),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+    let AsmMatchConverter = "cvtLdExtTWriteBackImm";
+  }
+  def r : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Inst{3-0} = Rm{3-0};
+    let AsmMatchConverter = "cvtLdExtTWriteBackReg";
+  }
 }
+
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
 }
 
 // Store
@@ -1881,98 +2445,302 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
 def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr),
                StMiscFrm, IIC_iStore_d_r,
-               "strd", "\t$Rt, $src2, $addr", []>, Requires<[IsARM, HasV5TE]>;
+               "strd", "\t$Rt, $src2, $addr", []>,
+           Requires<[IsARM, HasV5TE]> {
+  let Inst{21} = 0;
+}
 
 // Indexed stores
-def STR_PRE  : AI2stridx<0, 1, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
-                     IndexModePre, StFrm, IIC_iStore_ru,
-                     "str", "\t$Rt, [$Rn, $offset]!",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb,
-                      (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
-
-def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
-                     IndexModePost, StFrm, IIC_iStore_ru,
-                     "str", "\t$Rt, [$Rn], $offset",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb,
-                      (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
-
-def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
-                     IndexModePre, StFrm, IIC_iStore_bh_ru,
-                     "strb", "\t$Rt, [$Rn, $offset]!",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt,
-                                        GPR:$Rn, am2offset:$offset))]>;
-def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
-                     IndexModePost, StFrm, IIC_iStore_bh_ru,
-                     "strb", "\t$Rt, [$Rn], $offset",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt,
-                                        GPR:$Rn, am2offset:$offset))]>;
-
-def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
-                     IndexModePre, StMiscFrm, IIC_iStore_ru,
-                     "strh", "\t$Rt, [$Rn, $offset]!",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb,
-                      (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
-
-def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb),
-                     (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
-                     IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
-                     "strh", "\t$Rt, [$Rn], $offset",
-                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
-                     [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
-                                        GPR:$Rn, am3offset:$offset))]>;
-
-// For disassembly only
+multiclass AI2_stridx<bit isByte, string opc, InstrItinClass itin> {
+  def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, addrmode_imm12:$addr), IndexModePre,
+                            StFrm, itin,
+                            opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{11-0}  = addr{11-0};   // imm12
+    let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12";
+    let DecoderMethod = "DecodeSTRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, ldst_so_reg:$addr),
+                      IndexModePre, StFrm, itin,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23}    = addr{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};
+    let Inst{4}     = 0;           // Inst{4} = 0
+    let AsmMatchConverter = "cvtStWriteBackRegAddrMode2";
+    let DecoderMethod = "DecodeSTRPreReg";
+  }
+  def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                IndexModePost, StFrm, itin,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                IndexModePost, StFrm, itin,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm STR  : AI2_stridx<0, "str", IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+}
+
+
+
+def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                           (ins GPR:$Rt, addrmode3:$addr), IndexModePre,
+                           StMiscFrm, IIC_iStore_bh_ru,
+                           "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let AsmMatchConverter = "cvtStWriteBackRegAddrMode3";
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                       (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+                       IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                       "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                   [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+                                                      addr_offset_none:$addr,
+                                                      am3offset:$offset))]> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-def STRD_PRE : AI3stdpr<(outs GPR:$base_wb),
-                     (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
-                     StMiscFrm, IIC_iStore_d_ru,
-                     "strd", "\t$src1, $src2, [$base, $offset]!",
-                     "$base = $base_wb", []>;
-
-// For disassembly only
-def STRD_POST: AI3stdpo<(outs GPR:$base_wb),
-                     (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
-                     StMiscFrm, IIC_iStore_d_ru,
-                     "strd", "\t$src1, $src2, [$base], $offset",
-                     "$base = $base_wb", []>;
+def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+                          IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let AsmMatchConverter = "cvtStrdPre";
+}
+
+def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+                               am3offset:$offset),
+                          IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
 } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
 
-// STRT, STRBT, and STRHT are for disassembly only.
+// STRT, STRBT, and STRHT
 
-def STRT : AI2stridxT<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr),
-                     IndexModePost, StFrm, IIC_iStore_ru,
-                     "strt", "\t$Rt, $addr", "$addr.base = $Rn_wb",
-                     [/* For disassembly only; pattern left blank */]> {
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
-  let AsmMatchConverter = "CvtStWriteBackRegAddrMode2";
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
 }
 
-def STRBT : AI2stridxT<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr),
-                      IndexModePost, StFrm, IIC_iStore_bh_ru,
-                      "strbt", "\t$Rt, $addr", "$addr.base = $Rn_wb",
-                      [/* For disassembly only; pattern left blank */]> {
+let mayStore = 1, neverHasSideEffects = 1 in {
+def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
   let Inst{21} = 1; // overwrite
-  let AsmMatchConverter = "CvtStWriteBackRegAddrMode2";
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
 }
 
-def STRHT: AI3sthpo<(outs GPR:$base_wb), (ins GPR:$Rt, addrmode3:$addr),
-                    StMiscFrm, IIC_iStore_bh_ru,
-                    "strht", "\t$Rt, $addr", "$addr.base = $base_wb",
-                    [/* For disassembly only; pattern left blank */]> {
-  let Inst{21} = 1; // overwrite
-  let AsmMatchConverter = "CvtStWriteBackRegAddrMode3";
+
+multiclass AI3strT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+    let AsmMatchConverter = "cvtStExtTWriteBackImm";
+  }
+  def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                      (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Inst{3-0} = Rm{3-0};
+    let AsmMatchConverter = "cvtStExtTWriteBackReg";
+  }
 }
 
+
+defm STRHT : AI3strT<0b1011, "strht">;
+
+
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
 //
@@ -1996,6 +2764,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
     let Inst{24-23} = 0b01;       // Increment After
     let Inst{21}    = 1;          // Writeback
     let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
   }
   def DA :
     AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2012,6 +2782,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
     let Inst{24-23} = 0b00;       // Decrement After
     let Inst{21}    = 1;          // Writeback
     let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
   }
   def DB :
     AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2028,6 +2800,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
     let Inst{24-23} = 0b10;       // Decrement Before
     let Inst{21}    = 1;          // Writeback
     let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
   }
   def IB :
     AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2044,6 +2818,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
     let Inst{24-23} = 0b11;       // Increment Before
     let Inst{21}    = 1;          // Writeback
     let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
   }
 }
 
@@ -2084,6 +2860,9 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
   let Inst{15-12} = Rd;
 }
 
+def : ARMInstAlias<"movs${p} $Rd, $Rm", 
+                   (MOVr GPR:$Rd, GPR:$Rm, pred:$p, CPSR)>;
+
 // A version for the smaller set of tail call registers.
 let neverHasSideEffects = 1 in
 def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
@@ -2097,15 +2876,33 @@ def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
   let Inst{15-12} = Rd;
 }
 
-def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src),
-                DPSoRegFrm, IIC_iMOVsr,
-                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg:$src)]>,
+def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+                DPSoRegRegFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src",
+                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-8} = src{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = src{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+                DPSoRegImmFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
                 UnaryDP {
   bits<4> Rd;
   bits<12> src;
   let Inst{15-12} = Rd;
   let Inst{19-16} = 0b0000;
-  let Inst{11-0} = src;
+  let Inst{11-5} = src{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = src{3-0};
   let Inst{25} = 0;
 }
 
@@ -2121,7 +2918,7 @@ def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm),
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
                  DPFrm, IIC_iMOVi,
                  "movw", "\t$Rd, $imm",
                  [(set GPR:$Rd, imm0_65535:$imm)]>,
@@ -2133,16 +2930,22 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm),
   let Inst{19-16} = imm{15-12};
   let Inst{20} = 0;
   let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
 }
 
+def : InstAlias<"mov${p} $Rd, $imm",
+                (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
+        Requires<[IsARM]>;
+
 def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
                                 (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
 
 let Constraints = "$src = $Rd" in {
-def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm),
+def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
+                  (ins GPR:$src, imm0_65535_expr:$imm),
                   DPFrm, IIC_iMOVi,
                   "movt", "\t$Rd, $imm",
-                  [(set GPR:$Rd,
+                  [(set GPRnopc:$Rd,
                         (or (and GPR:$src, 0xffff),
                             lo16AllZero:$imm))]>, UnaryDP,
                   Requires<[IsARM, HasV6T2]> {
@@ -2153,6 +2956,7 @@ def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm),
   let Inst{19-16} = imm{15-12};
   let Inst{20} = 0;
   let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
 }
 
 def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
@@ -2186,30 +2990,28 @@ def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
 
 // Sign extenders
 
-defm SXTB  : AI_ext_rrot<0b01101010,
+def SXTB  : AI_ext_rrot<0b01101010,
                          "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm SXTH  : AI_ext_rrot<0b01101011,
+def SXTH  : AI_ext_rrot<0b01101011,
                          "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
 
-defm SXTAB : AI_exta_rrot<0b01101010,
+def SXTAB : AI_exta_rrot<0b01101010,
                "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm SXTAH : AI_exta_rrot<0b01101011,
+def SXTAH : AI_exta_rrot<0b01101011,
                "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
 
-// For disassembly only
-defm SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
+def SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
 
-// For disassembly only
-defm SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
 
 // Zero extenders
 
 let AddedComplexity = 16 in {
-defm UXTB   : AI_ext_rrot<0b01101110,
+def UXTB   : AI_ext_rrot<0b01101110,
                           "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm UXTH   : AI_ext_rrot<0b01101111,
+def UXTH   : AI_ext_rrot<0b01101111,
                           "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm UXTB16 : AI_ext_rrot<0b01101100,
+def UXTB16 : AI_ext_rrot<0b01101100,
                           "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
 
 // FIXME: This pattern incorrectly assumes the shl operator is a rotate.
@@ -2217,23 +3019,22 @@ defm UXTB16 : AI_ext_rrot<0b01101100,
 //        instead so we can include a check for masking back in the upper
 //        eight bits of the source into the lower eight bits of the result.
 //def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
-//               (UXTB16r_rot GPR:$Src, 24)>;
+//               (UXTB16r_rot GPR:$Src, 3)>;
 def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
-               (UXTB16r_rot GPR:$Src, 8)>;
+               (UXTB16 GPR:$Src, 1)>;
 
-defm UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
 }
 
 // This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
-// For disassembly only
-defm UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
 
 
-def SBFX  : I<(outs GPR:$Rd),
-              (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+def SBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
                AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
                "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
                Requires<[IsARM, HasV6T2]> {
@@ -2250,7 +3051,7 @@ def SBFX  : I<(outs GPR:$Rd),
 }
 
 def UBFX  : I<(outs GPR:$Rd),
-              (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+              (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width),
                AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
                "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
                Requires<[IsARM, HasV6T2]> {
@@ -2278,148 +3079,58 @@ defm SUB  : AsI1_bin_irs<0b0010, "sub",
                          BinOpFrag<(sub  node:$LHS, node:$RHS)>, "SUB">;
 
 // ADD and SUB with 's' bit set.
-defm ADDS : AI1_bin_s_irs<0b0100, "adds",
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm ADDS : AsI1_bin_s_irs<0b0100, "add",
                           IIC_iALUi, IIC_iALUr, IIC_iALUsr,
-                          BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
-defm SUBS : AI1_bin_s_irs<0b0010, "subs",
+                          BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm SUBS : AsI1_bin_s_irs<0b0010, "sub",
                           IIC_iALUi, IIC_iALUr, IIC_iALUsr,
-                          BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+                          BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
 
 defm ADC : AI1_adde_sube_irs<0b0101, "adc",
-                          BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>,
+                  BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>,
                           "ADC", 1>;
 defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
-                          BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>,
+                  BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
                           "SBC">;
 
-// ADC and SUBC with 's' bit set.
-let usesCustomInserter = 1 in {
-defm ADCS : AI1_adde_sube_s_irs<
-              BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>;
-defm SBCS : AI1_adde_sube_s_irs<
-              BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>;
-}
-
-def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
-                 IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm",
-                 [(set GPR:$Rd, (sub so_imm:$imm, GPR:$Rn))]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<12> imm;
-  let Inst{25} = 1;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-  let Inst{11-0} = imm;
-}
-
-// The reg/reg form is only defined for the disassembler; for codegen it is
-// equivalent to SUBrr.
-def RSBrr : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
-                 IIC_iALUr, "rsb", "\t$Rd, $Rn, $Rm",
-                 [/* For disassembly only; pattern left blank */]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<4> Rm;
-  let Inst{11-4} = 0b00000000;
-  let Inst{25} = 0;
-  let Inst{3-0} = Rm;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-}
-
-def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                 DPSoRegFrm, IIC_iALUsr, "rsb", "\t$Rd, $Rn, $shift",
-                 [(set GPR:$Rd, (sub so_reg:$shift, GPR:$Rn))]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<12> shift;
-  let Inst{25} = 0;
-  let Inst{11-0} = shift;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-}
+defm RSB  : AsI1_rbin_irs <0b0011, "rsb",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(sub node:$LHS, node:$RHS)>, "RSB">;
 
-// RSB with 's' bit set.
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-def RSBSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
-                 4, IIC_iALUi,
-                 [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]>;
-def RSBSrr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                 4, IIC_iALUr,
-                 [/* For disassembly only; pattern left blank */]>;
-def RSBSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                 4, IIC_iALUsr,
-                 [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]>;
-}
-
-let Uses = [CPSR] in {
-def RSCri : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
-                 DPFrm, IIC_iALUi, "rsc", "\t$Rd, $Rn, $imm",
-                 [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>,
-                 Requires<[IsARM]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<12> imm;
-  let Inst{25} = 1;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-  let Inst{11-0} = imm;
-}
-// The reg/reg form is only defined for the disassembler; for codegen it is
-// equivalent to SUBrr.
-def RSCrr : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                 DPFrm, IIC_iALUr, "rsc", "\t$Rd, $Rn, $Rm",
-                 [/* For disassembly only; pattern left blank */]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<4> Rm;
-  let Inst{11-4} = 0b00000000;
-  let Inst{25} = 0;
-  let Inst{3-0} = Rm;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-}
-def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                 DPSoRegFrm, IIC_iALUsr, "rsc", "\t$Rd, $Rn, $shift",
-                 [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>,
-                 Requires<[IsARM]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<12> shift;
-  let Inst{25} = 0;
-  let Inst{11-0} = shift;
-  let Inst{15-12} = Rd;
-  let Inst{19-16} = Rn;
-}
-}
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm RSBS : AsI1_rbin_s_is<0b0011, "rsb",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
 
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1, Uses = [CPSR] in {
-def RSCSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
-                  4, IIC_iALUi,
-                  [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>;
-def RSCSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                  4, IIC_iALUsr,
-                  [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>;
-}
+defm RSC : AI1_rsc_irs<0b0111, "rsc",
+                  BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
+                       "RSC">;
 
 // (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
 // The assume-no-carry-in form uses the negation of the input since add/sub
 // assume opposite meanings of the carry flag (i.e., carry == !borrow).
 // See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
 // details.
-def : ARMPat<(add    GPR:$src, so_imm_neg:$imm),
-             (SUBri  GPR:$src, so_imm_neg:$imm)>;
-def : ARMPat<(addc   GPR:$src, so_imm_neg:$imm),
-             (SUBSri GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(add     GPR:$src, so_imm_neg:$imm),
+             (SUBri   GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
+             (SUBSri  GPR:$src, so_imm_neg:$imm)>;
+
 // The with-carry-in form matches bitwise not instead of the negation.
 // Effectively, the inverse interpretation of the carry flag already accounts
 // for part of the negation.
-def : ARMPat<(adde_dead_carry   GPR:$src, so_imm_not:$imm),
-             (SBCri  GPR:$src, so_imm_not:$imm)>;
-def : ARMPat<(adde_live_carry   GPR:$src, so_imm_not:$imm),
-             (SBCSri GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
+             (SBCri   GPR:$src, so_imm_not:$imm)>;
 
 // Note: These are implemented in C++ code, because they have to generate
 // ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -2427,12 +3138,13 @@ def : ARMPat<(adde_live_carry   GPR:$src, so_imm_not:$imm),
 // (mul X, 2^n+1) -> (add (X << n), X)
 // (mul X, 2^n-1) -> (rsb X, (X << n))
 
-// ARM Arithmetic Instruction -- for disassembly only
+// ARM Arithmetic Instruction
 // GPR:$dst = GPR:$a op GPR:$b
 class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
-          list<dag> pattern = [/* For disassembly only; pattern left blank */],
-          dag iops = (ins GPR:$Rn, GPR:$Rm), string asm = "\t$Rd, $Rn, $Rm">
-  : AI<(outs GPR:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+          list<dag> pattern = [],
+          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+          string asm = "\t$Rd, $Rn, $Rm">
+  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
   bits<4> Rn;
   bits<4> Rd;
   bits<4> Rm;
@@ -2443,17 +3155,19 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
   let Inst{3-0}   = Rm;
 }
 
-// Saturating add/subtract -- for disassembly only
+// Saturating add/subtract
 
 def QADD    : AAI<0b00010000, 0b00000101, "qadd",
-                  [(set GPR:$Rd, (int_arm_qadd GPR:$Rm, GPR:$Rn))],
-                  (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
+                  [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
 def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
-                  [(set GPR:$Rd, (int_arm_qsub GPR:$Rm, GPR:$Rn))],
-                  (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [], (ins GPR:$Rm, GPR:$Rn),
+                  [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
                   "\t$Rd, $Rm, $Rn">;
-def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [], (ins GPR:$Rm, GPR:$Rn),
+def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
                   "\t$Rd, $Rm, $Rn">;
 
 def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
@@ -2469,7 +3183,7 @@ def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
 def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
 def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
 
-// Signed/Unsigned add/subtract -- for disassembly only
+// Signed/Unsigned add/subtract
 
 def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
 def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
@@ -2484,7 +3198,7 @@ def USAX   : AAI<0b01100101, 0b11110101, "usax">;
 def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
 def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
 
-// Signed/Unsigned halving add/subtract -- for disassembly only
+// Signed/Unsigned halving add/subtract
 
 def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
 def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
@@ -2499,7 +3213,7 @@ def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
 def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
 def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
 
-// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
+// Unsigned Sum of Absolute Differences [and Accumulate].
 
 def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                 MulFrm /* for convenience */, NoItinerary, "usad8",
@@ -2531,11 +3245,11 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
   let Inst{3-0} = Rn;
 }
 
-// Signed/Unsigned saturate -- for disassembly only
+// Signed/Unsigned saturate
 
-def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
-              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh",
-              [/* For disassembly only; pattern left blank */]> {
+def SSAT : AI<(outs GPRnopc:$Rd),
+              (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
   bits<4> Rd;
   bits<5> sat_imm;
   bits<4> Rn;
@@ -2544,14 +3258,14 @@ def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
   let Inst{5-4} = 0b01;
   let Inst{20-16} = sat_imm;
   let Inst{15-12} = Rd;
-  let Inst{11-7} = sh{7-3};
-  let Inst{6} = sh{0};
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
   let Inst{3-0} = Rn;
 }
 
-def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
-                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn",
-                [/* For disassembly only; pattern left blank */]> {
+def SSAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
   bits<4> Rd;
   bits<4> sat_imm;
   bits<4> Rn;
@@ -2562,9 +3276,9 @@ def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
   let Inst{3-0} = Rn;
 }
 
-def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
-              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $a$sh",
-              [/* For disassembly only; pattern left blank */]> {
+def USAT : AI<(outs GPRnopc:$Rd),
+              (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
   bits<4> Rd;
   bits<5> sat_imm;
   bits<4> Rn;
@@ -2572,15 +3286,15 @@ def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
   let Inst{27-21} = 0b0110111;
   let Inst{5-4} = 0b01;
   let Inst{15-12} = Rd;
-  let Inst{11-7} = sh{7-3};
-  let Inst{6} = sh{0};
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
   let Inst{20-16} = sat_imm;
   let Inst{3-0} = Rn;
 }
 
-def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm,
-                NoItinerary, "usat16", "\t$Rd, $sat_imm, $a",
-                [/* For disassembly only; pattern left blank */]> {
+def USAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
   bits<4> Rd;
   bits<4> sat_imm;
   bits<4> Rn;
@@ -2591,8 +3305,10 @@ def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm,
   let Inst{3-0} = Rn;
 }
 
-def : ARMV6Pat<(int_arm_ssat GPR:$a, imm:$pos), (SSAT imm:$pos, GPR:$a, 0)>;
-def : ARMV6Pat<(int_arm_usat GPR:$a, imm:$pos), (USAT imm:$pos, GPR:$a, 0)>;
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos),
+               (SSAT imm:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
+               (USAT imm:$pos, GPRnopc:$a, 0)>;
 
 //===----------------------------------------------------------------------===//
 //  Bitwise Instructions.
@@ -2611,6 +3327,10 @@ defm BIC   : AsI1_bin_irs<0b1110, "bic",
                           IIC_iBITi, IIC_iBITr, IIC_iBITsr,
                           BinOpFrag<(and node:$LHS, (not node:$RHS))>, "BIC">;
 
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
 def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
                AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
                "bfc", "\t$Rd, $imm", "$src = $Rd",
@@ -2622,16 +3342,16 @@ def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
   let Inst{6-0}   = 0b0011111;
   let Inst{15-12} = Rd;
   let Inst{11-7}  = imm{4-0}; // lsb
-  let Inst{20-16} = imm{9-5}; // width
+  let Inst{20-16} = imm{9-5}; // msb
 }
 
 // A8.6.18  BFI - Bitfield insert (Encoding A1)
-def BFI    : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
-               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
-               "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
-               [(set GPR:$Rd, (ARMbfi GPR:$src, GPR:$Rn,
-                                bf_inv_mask_imm:$imm))]>,
-               Requires<[IsARM, HasV6T2]> {
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+          AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+          "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+          [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+                           bf_inv_mask_imm:$imm))]>,
+          Requires<[IsARM, HasV6T2]> {
   bits<4> Rd;
   bits<4> Rn;
   bits<10> imm;
@@ -2643,25 +3363,6 @@ def BFI    : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
   let Inst{3-0}   = Rn;
 }
 
-// GNU as only supports this form of bfi (w/ 4 arguments)
-let isAsmParserOnly = 1 in
-def BFI4p : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn,
-                                   lsb_pos_imm:$lsb, width_imm:$width),
-               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
-               "bfi", "\t$Rd, $Rn, $lsb, $width", "$src = $Rd",
-               []>, Requires<[IsARM, HasV6T2]> {
-  bits<4> Rd;
-  bits<4> Rn;
-  bits<5> lsb;
-  bits<5> width;
-  let Inst{27-21} = 0b0111110;
-  let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
-  let Inst{15-12} = Rd;
-  let Inst{11-7}  = lsb;
-  let Inst{20-16} = width; // Custom encoder => lsb+width-1
-  let Inst{3-0}   = Rn;
-}
-
 def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
                   "mvn", "\t$Rd, $Rm",
                   [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
@@ -2673,15 +3374,31 @@ def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
   let Inst{15-12} = Rd;
   let Inst{3-0} = Rm;
 }
-def  MVNs  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg:$shift), DPSoRegFrm,
-                  IIC_iMVNsr, "mvn", "\t$Rd, $shift",
-                  [(set GPR:$Rd, (not so_reg:$shift))]>, UnaryDP {
+def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+                  DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+}
+def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+                  DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP {
   bits<4> Rd;
   bits<12> shift;
   let Inst{25} = 0;
   let Inst{19-16} = 0b0000;
   let Inst{15-12} = Rd;
-  let Inst{11-0} = shift;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
 }
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
 def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
@@ -2820,8 +3537,8 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
   bits<4> RdHi;
   bits<4> Rm;
   bits<4> Rn;
-  let Inst{19-16} = RdLo;
-  let Inst{15-12} = RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
   let Inst{11-8}  = Rm;
   let Inst{3-0}   = Rn;
 }
@@ -2855,8 +3572,7 @@ def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
 }
 
 def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
-               [/* For disassembly only; pattern left blank */]>,
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
             Requires<[IsARM, HasV6]> {
   let Inst{15-12} = 0b1111;
 }
@@ -2869,8 +3585,7 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
 
 def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
-               [/* For disassembly only; pattern left blank */]>,
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
             Requires<[IsARM, HasV6]>;
 
 def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
@@ -2881,8 +3596,7 @@ def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
 
 def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
-               [/* For disassembly only; pattern left blank */]>,
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
             Requires<[IsARM, HasV6]>;
 
 multiclass AI_smul<string opc, PatFrag opnode> {
@@ -2925,92 +3639,95 @@ multiclass AI_smul<string opc, PatFrag opnode> {
 
 
 multiclass AI_smla<string opc, PatFrag opnode> {
-  def BB : AMulxyIa<0b0001000, 0b00, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  let DecoderMethod = "DecodeSMLAInstruction" in {
+  def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPR:$Rd, (add GPR:$Ra,
-                               (opnode (sext_inreg GPR:$Rn, i16),
-                                       (sext_inreg GPR:$Rm, i16))))]>,
+              [(set GPRnopc:$Rd, (add GPR:$Ra,
+                               (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                       (sext_inreg GPRnopc:$Rm, i16))))]>,
            Requires<[IsARM, HasV5TE]>;
 
-  def BT : AMulxyIa<0b0001000, 0b10, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPR:$Rd, (add GPR:$Ra, (opnode (sext_inreg GPR:$Rn, i16),
-                                                   (sra GPR:$Rm, (i32 16)))))]>,
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                          (sra GPRnopc:$Rm, (i32 16)))))]>,
            Requires<[IsARM, HasV5TE]>;
 
-  def TB : AMulxyIa<0b0001000, 0b01, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
-                                                (sext_inreg GPR:$Rm, i16))))]>,
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                          (sext_inreg GPRnopc:$Rm, i16))))]>,
            Requires<[IsARM, HasV5TE]>;
 
-  def TT : AMulxyIa<0b0001000, 0b11, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
-             [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
-                                                   (sra GPR:$Rm, (i32 16)))))]>,
+             [(set GPRnopc:$Rd,
+                   (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                         (sra GPRnopc:$Rm, (i32 16)))))]>,
             Requires<[IsARM, HasV5TE]>;
 
-  def WB : AMulxyIa<0b0001001, 0b00, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
-                                      (sext_inreg GPR:$Rm, i16)), (i32 16))))]>,
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                  (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
            Requires<[IsARM, HasV5TE]>;
 
-  def WT : AMulxyIa<0b0001001, 0b10, (outs GPR:$Rd),
-              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
-                                        (sra GPR:$Rm, (i32 16))), (i32 16))))]>,
+              [(set GPRnopc:$Rd,
+                 (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                    (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
             Requires<[IsARM, HasV5TE]>;
+  }
 }
 
 defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 
-// Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPR:$RdLo, GPR:$RdHi),
-                      (ins GPR:$Rn, GPR:$Rm),
-                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm",
-                      [/* For disassembly only; pattern left blank */]>,
+// Halfword multiply accumulate long: SMLAL<x><y>.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
               Requires<[IsARM, HasV5TE]>;
 
-def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPR:$RdLo, GPR:$RdHi),
-                      (ins GPR:$Rn, GPR:$Rm),
-                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm",
-                      [/* For disassembly only; pattern left blank */]>,
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
               Requires<[IsARM, HasV5TE]>;
 
-def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPR:$RdLo, GPR:$RdHi),
-                      (ins GPR:$Rn, GPR:$Rm),
-                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm",
-                      [/* For disassembly only; pattern left blank */]>,
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
               Requires<[IsARM, HasV5TE]>;
 
-def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPR:$RdLo, GPR:$RdHi),
-                      (ins GPR:$Rn, GPR:$Rm),
-                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm",
-                      [/* For disassembly only; pattern left blank */]>,
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
               Requires<[IsARM, HasV5TE]>;
 
-// Helper class for AI_smld -- for disassembly only
+// Helper class for AI_smld.
 class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
                     InstrItinClass itin, string opc, string asm>
   : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
   bits<4> Rn;
   bits<4> Rm;
-  let Inst{4}     = 1;
-  let Inst{5}     = swap;
-  let Inst{6}     = sub;
-  let Inst{7}     = 0;
-  let Inst{21-20} = 0b00;
-  let Inst{22}    = long;
   let Inst{27-23} = 0b01110;
+  let Inst{22}    = long;
+  let Inst{21-20} = 0b00;
   let Inst{11-8}  = Rm;
+  let Inst{7}     = 0;
+  let Inst{6}     = sub;
+  let Inst{5}     = swap;
+  let Inst{4}     = 1;
   let Inst{3-0}   = Rn;
 }
 class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -3024,6 +3741,8 @@ class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
                 InstrItinClass itin, string opc, string asm>
   : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
   bits<4> Ra;
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
   let Inst{15-12} = Ra;
 }
 class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -3037,18 +3756,20 @@ class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
 
 multiclass AI_smld<bit sub, string opc> {
 
-  def D : AMulDualIa<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                   NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
 
-  def DX: AMulDualIa<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+  def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
                   NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
 
-  def LD: AMulDualI64<1, sub, 0, (outs GPR:$RdLo,GPR:$RdHi),
-                  (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+  def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
                   !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
 
-  def LDX : AMulDualI64<1, sub, 1, (outs GPR:$RdLo,GPR:$RdHi),
-                  (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+  def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
                   !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
 
 }
@@ -3058,10 +3779,10 @@ defm SMLS : AI_smld<1, "smls">;
 
 multiclass AI_sdml<bit sub, string opc> {
 
-  def D : AMulDualI<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                    NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
-  def DX : AMulDualI<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                    NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+  def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+  def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
 }
 
 defm SMUA : AI_sdml<0, "smua">;
@@ -3100,55 +3821,38 @@ def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
                    (and (srl GPR:$Rm, (i32 8)), 0xFF)),
                (REVSH GPR:$Rm)>;
 
-def lsl_shift_imm : SDNodeXForm<imm, [{
-  unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue());
-  return CurDAG->getTargetConstant(Sh, MVT::i32);
-}]>;
-
-def lsl_amt : ImmLeaf<i32, [{
-  return Imm > 0 && Imm < 32;
-}], lsl_shift_imm>;
-
-def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd),
-                              (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
                IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
-               [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF),
-                                  (and (shl GPR:$Rm, lsl_amt:$sh),
-                                       0xFFFF0000)))]>,
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+                                      (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
                Requires<[IsARM, HasV6]>;
 
 // Alternate cases for PKHBT where identities eliminate some nodes.
-def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (and GPR:$Rm, 0xFFFF0000)),
-               (PKHBT GPR:$Rn, GPR:$Rm, 0)>;
-def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (shl GPR:$Rm, imm16_31:$sh)),
-               (PKHBT GPR:$Rn, GPR:$Rm, (lsl_shift_imm imm16_31:$sh))>;
-
-def asr_shift_imm : SDNodeXForm<imm, [{
-  unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::asr, N->getZExtValue());
-  return CurDAG->getTargetConstant(Sh, MVT::i32);
-}]>;
-
-def asr_amt : ImmLeaf<i32, [{
-  return Imm > 0 && Imm <= 32;
-}], asr_shift_imm>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
 // will match the pattern below.
-def PKHTB : APKHI<0b01101000, 1, (outs GPR:$Rd),
-                              (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
                IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
-               [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF0000),
-                                  (and (sra GPR:$Rm, asr_amt:$sh),
-                                       0xFFFF)))]>,
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+                                      (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+                                           0xFFFF)))]>,
                Requires<[IsARM, HasV6]>;
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, imm16_31:$sh)),
-               (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm16_31:$sh))>;
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
-                   (and (srl GPR:$src2, imm1_15:$sh), 0xFFFF)),
-               (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm1_15:$sh))>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (srl GPRnopc:$src2, imm16_31:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
 
 //===----------------------------------------------------------------------===//
 //  Comparison Instructions...
@@ -3163,8 +3867,10 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
              (CMPri   GPR:$src, so_imm:$imm)>;
 def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
              (CMPrr   GPR:$src, GPR:$rhs)>;
-def : ARMPat<(ARMcmpZ GPR:$src, so_reg:$rhs),
-             (CMPrs   GPR:$src, so_reg:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+             (CMPrsi   GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+             (CMPrsr   GPR:$src, so_reg_reg:$rhs)>;
 
 // FIXME: We have to be careful when using the CMN instruction and comparison
 // with 0. One would expect these two pieces of code should give identical
@@ -3250,15 +3956,23 @@ def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
                            4, IIC_iCMOVr,
   [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
       RegConstraint<"$false = $Rd">;
-def MOVCCs : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_reg:$shift, pred:$p),
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_reg_imm:$shift, pred:$p),
+                           4, IIC_iCMOVsr,
+  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
+                            imm:$cc, CCR:$ccr))*/]>,
+      RegConstraint<"$false = $Rd">;
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_reg_reg:$shift, pred:$p),
                            4, IIC_iCMOVsr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>,
+  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            imm:$cc, CCR:$ccr))*/]>,
       RegConstraint<"$false = $Rd">;
 
+
 let isMoveImm = 1 in
 def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
-                             (ins GPR:$false, i32imm_hilo16:$imm, pred:$p),
+                             (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
                              4, IIC_iMOVi,
                              []>,
       RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
@@ -3288,9 +4002,14 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
 // Atomic operations intrinsics
 //
 
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let ParserMethod = "parseMemBarrierOptOperand";
+}
 def memb_opt : Operand<i32> {
   let PrintMethod = "printMemBOption";
   let ParserMatchClass = MemBarrierOptOperand;
+  let DecoderMethod = "DecodeMemBarrierOption";
 }
 
 // memory barriers protect the atomic sequences
@@ -3321,8 +4040,16 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
   let Inst{3-0} = opt;
 }
 
+// Pseudo isntruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def ABS : ARMPseudoInst<
+  (outs GPR:$dst), (ins GPR:$src),
+  8, NoItinerary, []>;
+}
+
 let usesCustomInserter = 1 in {
-  let Uses = [CPSR] in {
+  let Defs = [CPSR] in {
     def ATOMIC_LOAD_ADD_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
@@ -3437,44 +4164,47 @@ let usesCustomInserter = 1 in {
 }
 
 let mayLoad = 1 in {
-def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary,
                     "ldrexb", "\t$Rt, $addr", []>;
-def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
-                    "ldrexh", "\t$Rt, $addr", []>;
-def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
-                    "ldrex", "\t$Rt, $addr", []>;
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexh", "\t$Rt, $addr", []>;
+def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrex", "\t$Rt, $addr", []>;
 let hasExtraDefRegAllocReq = 1 in
-  def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
-                      NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
+def LDREXD: AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
                     NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>;
-def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
                     NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
-def STREX  : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
                     NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
 }
 
 let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def STREXD : AIstrex<0b01, (outs GPR:$Rd),
-                    (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr),
-                    NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>;
+                    (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr),
+                    NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
 
-// Clear-Exclusive is for disassembly only.
-def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
-                [/* For disassembly only; pattern left blank */]>,
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
             Requires<[IsARM, HasV7]>  {
   let Inst{31-0} = 0b11110101011111111111000000011111;
 }
 
-// SWP/SWPB are deprecated in V6/V7 and for disassembly only.
-let mayLoad = 1 in {
-def SWP  : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swp",
-             [/* For disassembly only; pattern left blank */]>;
-def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb",
-             [/* For disassembly only; pattern left blank */]>;
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in {
+def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr),
+                "swp", []>;
+def SWPB: AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr),
+                "swpb", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3526,108 +4256,171 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
 
 class ACI<dag oops, dag iops, string opc, string asm,
           IndexMode im = IndexModeNone>
+  : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+      opc, asm, "", []> {
+  let Inst{27-25} = 0b110;
+}
+class ACInoP<dag oops, dag iops, string opc, string asm,
+          IndexMode im = IndexModeNone>
   : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
-         opc, asm, "", [/* For disassembly only; pattern left blank */]> {
+         opc, asm, "", []> {
+  let Inst{31-28} = 0b1111;
   let Inst{27-25} = 0b110;
 }
-
-multiclass LdStCop<bits<4> op31_28, bit load, dag ops, string opc, string cond>{
-
-  def _OFFSET : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr"> {
-    let Inst{31-28} = op31_28;
+multiclass LdStCop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 0; // W = 0
-    let Inst{22} = 0; // D = 0
     let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def _PRE : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr!", IndexModePre> {
-    let Inst{31-28} = op31_28;
+  def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                 asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 1; // W = 1
-    let Inst{22} = 0; // D = 0
     let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def _POST : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr", IndexModePost> {
-    let Inst{31-28} = op31_28;
+  def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                              postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 1; // W = 1
-    let Inst{22} = 0; // D = 0
     let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
   def _OPTION : ACI<(outs),
-      !con((ins nohash_imm:$cop,nohash_imm:$CRd,GPR:$base, nohash_imm:$option),
-            ops),
-      !strconcat(opc, cond), "\tp$cop, cr$CRd, [$base], \\{$option\\}"> {
-    let Inst{31-28} = op31_28;
+                    (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                         coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 0; // P = 0
     let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
     let Inst{21} = 0; // W = 0
-    let Inst{22} = 0; // D = 0
     let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def L_OFFSET : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr"> {
-    let Inst{31-28} = op31_28;
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                       asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 0; // W = 0
-    let Inst{22} = 1; // D = 1
     let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def L_PRE : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr!",
-      IndexModePre> {
-    let Inst{31-28} = op31_28;
+  def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 1; // W = 1
-    let Inst{22} = 1; // D = 1
     let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def L_POST : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
-      !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr",
-      IndexModePost> {
-    let Inst{31-28} = op31_28;
+  def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                                 postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
     let Inst{21} = 1; // W = 1
-    let Inst{22} = 1; // D = 1
     let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
-
-  def L_OPTION : ACI<(outs),
-      !con((ins nohash_imm:$cop, nohash_imm:$CRd,GPR:$base,nohash_imm:$option),
-            ops),
-      !strconcat(!strconcat(opc, "l"), cond),
-      "\tp$cop, cr$CRd, [$base], \\{$option\\}"> {
-    let Inst{31-28} = op31_28;
+  def _OPTION : ACInoP<(outs),
+                       (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                            coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
     let Inst{24} = 0; // P = 0
     let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
     let Inst{21} = 0; // W = 0
-    let Inst{22} = 1; // D = 1
     let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
   }
 }
 
-defm LDC  : LdStCop<{?,?,?,?}, 1, (ins pred:$p), "ldc",  "${p}">;
-defm LDC2 : LdStCop<0b1111,    1, (ins),         "ldc2", "">;
-defm STC  : LdStCop<{?,?,?,?}, 0, (ins pred:$p), "stc",  "${p}">;
-defm STC2 : LdStCop<0b1111,    0, (ins),         "stc2", "">;
+defm LDC   : LdStCop <1, 0, "ldc">;
+defm LDCL  : LdStCop <1, 1, "ldcl">;
+defm STC   : LdStCop <0, 0, "stc">;
+defm STCL  : LdStCop <0, 1, "stcl">;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2">;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
+defm STC2  : LdSt2Cop<0, 0, "stc2">;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">;
 
 //===----------------------------------------------------------------------===//
-// Move between coprocessor and ARM core register -- for disassembly only
+// Move between coprocessor and ARM core register.
 //
 
 class MovRCopro<string opc, bit direction, dag oops, dag iops,
@@ -3660,8 +4453,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
                                   imm:$CRm, imm:$opc2)]>;
 def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
                     (outs GPR:$Rt),
-                    (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
-                         i32imm:$opc2), []>;
+                    (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         imm0_7:$opc2), []>;
 
 def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
              (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3697,15 +4490,14 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
                                      imm:$CRm, imm:$opc2)]>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
                       (outs GPR:$Rt),
-                      (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
-                           i32imm:$opc2), []>;
+                      (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           imm0_7:$opc2), []>;
 
 def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
                               imm:$CRm, imm:$opc2),
                 (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class MovRRCopro<string opc, bit direction,
-                 list<dag> pattern = [/* For disassembly only */]>
+class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
   : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
         NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
@@ -3730,8 +4522,7 @@ def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
                                      imm:$CRm)]>;
 def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 
-class MovRRCopro2<string opc, bit direction,
-                  list<dag> pattern = [/* For disassembly only */]>
+class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
          GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary,
          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
@@ -3758,20 +4549,22 @@ def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
 def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
 
 //===----------------------------------------------------------------------===//
-// Move between special register and ARM core register -- for disassembly only
+// Move between special register and ARM core register
 //
 
 // Move to ARM core register from Special Register
-def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
-              [/* For disassembly only; pattern left blank */]> {
+def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,
+              "mrs", "\t$Rd, apsr", []> {
   bits<4> Rd;
   let Inst{23-16} = 0b00001111;
   let Inst{15-12} = Rd;
   let Inst{7-4} = 0b0000;
 }
 
-def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr",
-              [/* For disassembly only; pattern left blank */]> {
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPR:$Rd, pred:$p)>, Requires<[IsARM]>;
+
+def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,
+                 "mrs", "\t$Rd, spsr", []> {
   bits<4> Rd;
   let Inst{23-16} = 0b01001111;
   let Inst{15-12} = Rd;
@@ -3785,8 +4578,7 @@ def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr",
 // operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
 // the mask with the fields to be accessed in the special register.
 def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
-              "msr", "\t$mask, $Rn",
-              [/* For disassembly only; pattern left blank */]> {
+              "msr", "\t$mask, $Rn", []> {
   bits<5> mask;
   bits<4> Rn;
 
@@ -3800,8 +4592,7 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
 }
 
 def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
-               "msr", "\t$mask, $a",
-               [/* For disassembly only; pattern left blank */]> {
+               "msr", "\t$mask, $a", []> {
   bits<5> mask;
   bits<12> a;
 
@@ -4030,6 +4821,47 @@ def : ARMV5TEPat<(add GPR:$acc,
 def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
          Requires<[IsARM, HasV6]>;
 
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
+               (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
+               (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8),  (SXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
+               (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
+               (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+             (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+             (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+             (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+             (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+             (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+             (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+             (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+             (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+             (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+             (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
+
 
 //===----------------------------------------------------------------------===//
 // Thumb Support
@@ -4070,7 +4902,103 @@ def : MnemonicAlias<"swi", "svc">;
 // Load / Store Multiple
 def : MnemonicAlias<"ldmfd", "ldm">;
 def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
 def : MnemonicAlias<"stmfd", "stmdb">;
 def : MnemonicAlias<"stmia", "stm">;
 def : MnemonicAlias<"stmea", "stm">;
 
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+                (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+                (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+                (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+                (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+                (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+                (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsda">;
+def : MnemonicAlias<"srsea", "srsdb">;
+def : MnemonicAlias<"srsfd", "srsia">;
+def : MnemonicAlias<"srsed", "srsib">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// LDRSBT/LDRHT/LDRSHT post-index offset if optional.
+// Note that the write-back output register is a dummy operand for MC (it's
+// only meaningful for codegen), so we just pass zero here.
+// FIXME: tblgen not cooperating with argument conversions.
+//def : InstAlias<"ldrsbt${p} $Rt, $addr",
+//                (LDRSBTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0,pred:$p)>;
+//def : InstAlias<"ldrht${p} $Rt, $addr",
+//                (LDRHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>;
+//def : InstAlias<"ldrsht${p} $Rt, $addr",
+//                (LDRSHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 0df62f4563436..7aad18695bb3b 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -11,6 +11,35 @@
 //
 //===----------------------------------------------------------------------===//
 
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def VectorIndex8Operand  : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndex8Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndex16Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndex32Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+
 //===----------------------------------------------------------------------===//
 // NEON-specific DAG Nodes.
 //===----------------------------------------------------------------------===//
@@ -175,7 +204,8 @@ class VLDQQWBPseudo<InstrItinClass itin>
                 (ins addrmode6:$addr, am6offset:$offset), itin,
                 "$addr.addr = $wb">;
 class VLDQQQQPseudo<InstrItinClass itin>
-  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,"">;
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,
+                "$src = $dst">;
 class VLDQQQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
                 (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
@@ -190,6 +220,7 @@ class VLD1D<bits<4> op7_4, string Dt>
           "vld1", Dt, "\\{$Vd\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD1Q<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2),
@@ -197,6 +228,7 @@ class VLD1Q<bits<4> op7_4, string Dt>
           "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
@@ -221,6 +253,7 @@ class VLD1DWB<bits<4> op7_4, string Dt>
           "vld1", Dt, "\\{$Vd\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD1QWB<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
@@ -228,6 +261,7 @@ class VLD1QWB<bits<4> op7_4, string Dt>
           "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD1d8_UPD  : VLD1DWB<{0,0,0,?}, "8">;
@@ -252,12 +286,14 @@ class VLD1D3<bits<4> op7_4, string Dt>
           "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD1D3WB<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
           (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x3u, "vld1", Dt,
           "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD1d8T      : VLD1D3<{0,0,0,?}, "8">;
@@ -280,6 +316,7 @@ class VLD1D4<bits<4> op7_4, string Dt>
           "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD1D4WB<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b0010,op7_4,
@@ -288,6 +325,7 @@ class VLD1D4WB<bits<4> op7_4, string Dt>
           "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb",
           []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8">;
@@ -310,6 +348,7 @@ class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD2Q<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, 0b0011, op7_4,
@@ -318,6 +357,7 @@ class VLD2Q<bits<4> op7_4, string Dt>
           "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def  VLD2d8   : VLD2D<0b1000, {0,0,?,?}, "8">;
@@ -343,6 +383,7 @@ class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 class VLD2QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, 0b0011, op7_4,
@@ -351,6 +392,7 @@ class VLD2QWB<bits<4> op7_4, string Dt>
           "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD2d8_UPD  : VLD2DWB<0b1000, {0,0,?,?}, "8">;
@@ -384,6 +426,7 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
@@ -402,6 +445,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
@@ -441,6 +485,7 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
@@ -459,6 +504,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
 }
 
 def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
@@ -530,6 +576,7 @@ class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                                          (i32 (LoadOp addrmode6:$Rn)),
                                          imm:$lane))]> {
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
 }
 class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
              PatFrag LoadOp>
@@ -541,6 +588,7 @@ class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                                          (i32 (LoadOp addrmode6oneL32:$Rn)),
                                          imm:$lane))]> {
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
 }
 class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
   let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
@@ -580,7 +628,9 @@ class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           (ins addrmode6:$Rn, am6offset:$Rm,
            DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
           "\\{$Vd[$lane]\\}, $Rn$Rm",
-          "$src = $Vd, $Rn.addr = $wb", []>;
+          "$src = $Vd, $Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVLD1LN";
+}
 
 def VLD1LNd8_UPD  : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
   let Inst{7-5} = lane{2-0};
@@ -607,6 +657,7 @@ class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
           "$src1 = $Vd, $src2 = $dst2", []> {
   let Rm = 0b1111;
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
 }
 
 def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8"> {
@@ -642,6 +693,7 @@ class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
           "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
 }
 
 def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
@@ -676,6 +728,7 @@ class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
           "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
           "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD3LN";
 }
 
 def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8"> {
@@ -712,7 +765,9 @@ class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           IIC_VLD3lnu, "vld3", Dt,
           "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
           "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
-          []>;
+          []> {
+  let DecoderMethod = "DecodeVLD3LN";
+}
 
 def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
   let Inst{7-5} = lane{2-0};
@@ -748,6 +803,7 @@ class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
           "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
   let Rm = 0b1111;
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN";
 }
 
 def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8"> {
@@ -788,6 +844,7 @@ class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
 "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
           []> {
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN"  ;
 }
 
 def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
@@ -825,6 +882,7 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
           [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
 }
 class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> {
   let Pattern = [(set QPR:$dst,
@@ -852,6 +910,7 @@ class VLD1QDUP<bits<4> op7_4, string Dt>
           "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
 }
 
 def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8">;
@@ -864,12 +923,14 @@ class VLD1DUPWB<bits<4> op7_4, string Dt>
           (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
           "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
 }
 class VLD1QDUPWB<bits<4> op7_4, string Dt>
   : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
           (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
           "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
 }
 
 def VLD1DUPd8_UPD  : VLD1DUPWB<{0,0,0,0}, "8">;
@@ -891,6 +952,7 @@ class VLD2DUP<bits<4> op7_4, string Dt>
           "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
 }
 
 def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8">;
@@ -912,6 +974,7 @@ class VLD2DUPWB<bits<4> op7_4, string Dt>
           (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD2dupu,
           "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
 }
 
 def VLD2DUPd8_UPD  : VLD2DUPWB<{0,0,0,0}, "8">;
@@ -932,7 +995,8 @@ class VLD3DUP<bits<4> op7_4, string Dt>
           (ins addrmode6dup:$Rn), IIC_VLD3dup,
           "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
   let Rm = 0b1111;
-  let Inst{4} = Rn{4};
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
 }
 
 def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
@@ -954,7 +1018,8 @@ class VLD3DUPWB<bits<4> op7_4, string Dt>
           (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu,
           "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
-  let Inst{4} = Rn{4};
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
 }
 
 def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8">;
@@ -977,6 +1042,7 @@ class VLD4DUP<bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
 }
 
 def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">;
@@ -1000,6 +1066,7 @@ class VLD4DUPWB<bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
 }
 
 def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">;
@@ -1045,6 +1112,7 @@ class VST1D<bits<4> op7_4, string Dt>
           IIC_VST1, "vst1", Dt, "\\{$Vd\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST1Q<bits<4> op7_4, string Dt>
   : NLdSt<0,0b00,0b1010,op7_4, (outs),
@@ -1052,6 +1120,7 @@ class VST1Q<bits<4> op7_4, string Dt>
           "vst1", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def  VST1d8   : VST1D<{0,0,0,?}, "8">;
@@ -1075,6 +1144,7 @@ class VST1DWB<bits<4> op7_4, string Dt>
           (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd), IIC_VST1u,
           "vst1", Dt, "\\{$Vd\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST1QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb),
@@ -1082,6 +1152,7 @@ class VST1QWB<bits<4> op7_4, string Dt>
           IIC_VST1x2u, "vst1", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST1d8_UPD  : VST1DWB<{0,0,0,?}, "8">;
@@ -1106,6 +1177,7 @@ class VST1D3<bits<4> op7_4, string Dt>
           IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST1D3WB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb),
@@ -1114,6 +1186,7 @@ class VST1D3WB<bits<4> op7_4, string Dt>
           IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST1d8T      : VST1D3<{0,0,0,?}, "8">;
@@ -1137,6 +1210,7 @@ class VST1D4<bits<4> op7_4, string Dt>
           []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST1D4WB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb),
@@ -1145,6 +1219,7 @@ class VST1D4WB<bits<4> op7_4, string Dt>
           "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST1d8Q      : VST1D4<{0,0,?,?}, "8">;
@@ -1167,6 +1242,7 @@ class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt>
           IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST2Q<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0011, op7_4, (outs),
@@ -1175,6 +1251,7 @@ class VST2Q<bits<4> op7_4, string Dt>
           "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def  VST2d8   : VST2D<0b1000, {0,0,?,?}, "8">;
@@ -1200,6 +1277,7 @@ class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 class VST2QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
@@ -1208,6 +1286,7 @@ class VST2QWB<bits<4> op7_4, string Dt>
           "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST2d8_UPD  : VST2DWB<0b1000, {0,0,?,?}, "8">;
@@ -1241,6 +1320,7 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
@@ -1259,6 +1339,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
@@ -1298,6 +1379,7 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
@@ -1316,6 +1398,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
 }
 
 def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1381,6 +1464,7 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
           IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
           [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> {
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST1LN";
 }
 class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
              PatFrag StoreOp, SDNode ExtractOp>
@@ -1389,6 +1473,7 @@ class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
           IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
           [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST1LN";
 }
 class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
   : VSTQLNPseudo<IIC_VST1ln> {
@@ -1429,7 +1514,9 @@ class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
           "\\{$Vd[$lane]\\}, $Rn$Rm",
           "$Rn.addr = $wb",
           [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
-                                  addrmode6:$Rn, am6offset:$Rm))]>;
+                                  addrmode6:$Rn, am6offset:$Rm))]> {
+  let DecoderMethod = "DecodeVST1LN";
+}
 class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
   : VSTQLNWBPseudo<IIC_VST1lnu> {
   let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
@@ -1465,6 +1552,7 @@ class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
           "", []> {
   let Rm = 0b1111;
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
 }
 
 def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8"> {
@@ -1502,6 +1590,7 @@ class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset",
           "$addr.addr = $wb", []> {
   let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
 }
 
 def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
@@ -1535,6 +1624,7 @@ class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
            nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
           "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
   let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST3LN";
 }
 
 def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8"> {
@@ -1569,7 +1659,9 @@ class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
           IIC_VST3lnu, "vst3", Dt,
           "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
-          "$Rn.addr = $wb", []>;
+          "$Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVST3LN";
+}
 
 def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
   let Inst{7-5} = lane{2-0};
@@ -1604,6 +1696,7 @@ class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
           "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
 }
 
 def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8"> {
@@ -1642,6 +1735,7 @@ class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
 }
 
 def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
@@ -4039,6 +4133,7 @@ class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
   : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
            ResTy, OpTy, OpNode> {
   let Inst{21-16} = op21_16;
+  let DecoderMethod = "DecodeVSHLMaxInstruction";
 }
 def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
                           v8i16, v8i8, NEONvshlli>;
@@ -4219,16 +4314,6 @@ def : InstAlias<"vmov${p} $Vd, $Vm",
 def : InstAlias<"vmov${p} $Vd, $Vm",
                 (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
 
-let neverHasSideEffects = 1 in {
-// Pseudo vector move instructions for QQ and QQQQ registers. This should
-// be expanded after register allocation is completed.
-def  VMOVQQ   : PseudoInst<(outs QQPR:$dst), (ins QQPR:$src),
-                NoItinerary, []>;
-
-def  VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src),
-                NoItinerary, []>;
-} // neverHasSideEffects
-
 //   VMOV     : Vector Move (Immediate)
 
 let isReMaterializable = 1 in {
@@ -4462,36 +4547,42 @@ def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
 //   VDUP     : Vector Duplicate Lane (from scalar to all elements)
 
 class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
-              ValueType Ty>
-  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
-              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+              ValueType Ty, Operand IdxTy>
+  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
               [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
 
 class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
-              ValueType ResTy, ValueType OpTy>
-  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
-              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+              ValueType ResTy, ValueType OpTy, Operand IdxTy>
+  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
               [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
-                                      imm:$lane)))]>;
+                                      VectorIndex32:$lane)))]>;
 
 // Inst{19-16} is partially specified depending on the element size.
 
-def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8> {
+def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+  bits<3> lane;
   let Inst{19-17} = lane{2-0};
 }
-def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> {
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+  bits<2> lane;
   let Inst{19-18} = lane{1-0};
 }
-def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> {
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+  bits<1> lane;
   let Inst{19} = lane{0};
 }
-def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> {
+def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+  bits<3> lane;
   let Inst{19-17} = lane{2-0};
 }
-def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> {
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+  bits<2> lane;
   let Inst{19-18} = lane{1-0};
 }
-def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> {
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+  bits<1> lane;
   let Inst{19} = lane{0};
 }
 
@@ -4753,6 +4844,7 @@ def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
 // Vector Table Lookup and Table Extension.
 
 //   VTBL     : Vector Table Lookup
+let DecoderMethod = "DecodeTBLInstruction" in {
 def  VTBL1
   : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
         (ins DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
@@ -4815,6 +4907,7 @@ def  VTBX3Pseudo
 def  VTBX4Pseudo
   : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
                 IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
 
 //===----------------------------------------------------------------------===//
 // NEON instructions for single-precision FP math
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index bfe83eceb13fb..cedb54799db0d 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -19,6 +19,19 @@ def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                        SDNPVariadic]>;
 
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+  unsigned Imm = N->getZExtValue();
+  return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), MVT::i32);
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+  uint64_t Imm = N->getZExtValue();
+  return Imm > 0 && Imm <= 32;
+}], imm_sr_XFORM> {
+  let PrintMethod = "printThumbSRImm";
+  let ParserMatchClass = ThumbSRImmAsmOperand;
+}
+
 def imm_neg_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
 }]>;
@@ -30,10 +43,6 @@ def imm0_7_neg : PatLeaf<(i32 imm), [{
   return (uint32_t)-N->getZExtValue() < 8;
 }], imm_neg_XFORM>;
 
-def imm0_255_asmoperand : AsmOperandClass { let Name = "Imm0_255"; }
-def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
-  let ParserMatchClass = imm0_255_asmoperand;
-}
 def imm0_255_comp : PatLeaf<(i32 imm), [{
   return ~((uint32_t)N->getZExtValue()) < 256;
 }]>;
@@ -69,8 +78,17 @@ def t_adrlabel : Operand<i32> {
 }
 
 // Scaled 4 immediate.
-def t_imm_s4 : Operand<i32> {
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_1020s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
   let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_508s4_asmoperand;
   let OperandType = "OPERAND_IMMEDIATE";
 }
 
@@ -79,113 +97,129 @@ def t_imm_s4 : Operand<i32> {
 let OperandType = "OPERAND_PCREL" in {
 def t_brtarget : Operand<OtherVT> {
   let EncoderMethod = "getThumbBRTargetOpValue";
+  let DecoderMethod = "DecodeThumbBROperand";
 }
 
 def t_bcctarget : Operand<i32> {
   let EncoderMethod = "getThumbBCCTargetOpValue";
+  let DecoderMethod = "DecodeThumbBCCTargetOperand";
 }
 
 def t_cbtarget : Operand<i32> {
   let EncoderMethod = "getThumbCBTargetOpValue";
+  let DecoderMethod = "DecodeThumbCmpBROperand";
 }
 
 def t_bltarget : Operand<i32> {
   let EncoderMethod = "getThumbBLTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLTargetOperand";
 }
 
 def t_blxtarget : Operand<i32> {
   let EncoderMethod = "getThumbBLXTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLXOffset";
 }
 }
 
-def MemModeRegThumbAsmOperand : AsmOperandClass {
-  let Name = "MemModeRegThumb";
-  let SuperClasses = [];
-}
-
-def MemModeImmThumbAsmOperand : AsmOperandClass {
-  let Name = "MemModeImmThumb";
-  let SuperClasses = [];
-}
-
 // t_addrmode_rr := reg + reg
 //
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
 def t_addrmode_rr : Operand<i32>,
                     ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
   let EncoderMethod = "getThumbAddrModeRegRegOpValue";
   let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
 }
 
 // t_addrmode_rrs := reg + reg
 //
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
 def t_addrmode_rrs1 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
   let EncoderMethod = "getThumbAddrModeRegRegOpValue";
   let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
-  let ParserMatchClass = MemModeRegThumbAsmOperand;
 }
 def t_addrmode_rrs2 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
   let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
   let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
-  let ParserMatchClass = MemModeRegThumbAsmOperand;
 }
 def t_addrmode_rrs4 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
   let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
   let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
-  let ParserMatchClass = MemModeRegThumbAsmOperand;
 }
 
 // t_addrmode_is4 := reg + imm5 * 4
 //
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
 def t_addrmode_is4 : Operand<i32>,
                      ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
   let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
   let PrintMethod = "printThumbAddrModeImm5S4Operand";
+  let ParserMatchClass = t_addrmode_is4_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 // t_addrmode_is2 := reg + imm5 * 2
 //
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
 def t_addrmode_is2 : Operand<i32>,
                      ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
   let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
   let PrintMethod = "printThumbAddrModeImm5S2Operand";
+  let ParserMatchClass = t_addrmode_is2_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 // t_addrmode_is1 := reg + imm5
 //
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
 def t_addrmode_is1 : Operand<i32>,
                      ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
   let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
   let PrintMethod = "printThumbAddrModeImm5S1Operand";
+  let ParserMatchClass = t_addrmode_is1_asm_operand;
   let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 // t_addrmode_sp := sp + imm8 * 4
 //
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
 def t_addrmode_sp : Operand<i32>,
                     ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
   let EncoderMethod = "getAddrModeThumbSPOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeSP";
   let PrintMethod = "printThumbAddrModeSPOperand";
+  let ParserMatchClass = t_addrmode_sp_asm_operand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 // t_addrmode_pc := <label> => pc + imm8 * 4
 //
 def t_addrmode_pc : Operand<i32> {
   let EncoderMethod = "getAddrModePCOpValue";
-  let ParserMatchClass = MemModeImmThumbAsmOperand;
+  let DecoderMethod = "DecodeThumbAddrModePC";
 }
 
 //===----------------------------------------------------------------------===//
@@ -207,68 +241,52 @@ def tADJCALLSTACKDOWN :
             Requires<[IsThumb, IsThumb1Only]>;
 }
 
-// T1Disassembly - A simple class to make encoding some disassembly patterns
-// easier and less verbose.
-class T1Disassembly<bits<2> op1, bits<8> op2>
+class T1SystemEncoding<bits<8> opc>
   : T1Encoding<0b101111> {
-  let Inst{9-8} = op1;
-  let Inst{7-0} = op2;
+  let Inst{9-8} = 0b11;
+  let Inst{7-0} = opc;
 }
 
-def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "",
-                [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b11, 0x00>; // A8.6.110
+def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
+           T1SystemEncoding<0x00>, // A8.6.110
+        Requires<[IsThumb2]>;
 
-def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "",
-                  [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b11, 0x10>; // A8.6.410
+def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
+           T1SystemEncoding<0x10>; // A8.6.410
 
-def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "",
-                [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b11, 0x20>; // A8.6.408
+def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
+           T1SystemEncoding<0x20>; // A8.6.408
 
-def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "",
-                [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b11, 0x30>; // A8.6.409
+def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
+           T1SystemEncoding<0x30>; // A8.6.409
 
-def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "",
-                [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b11, 0x40>; // A8.6.157
+def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
+           T1SystemEncoding<0x40>; // A8.6.157
 
-// The i32imm operand $val can be used by a debugger to store more information
+// The imm operand $val can be used by a debugger to store more information
 // about the breakpoint.
-def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val",
-                [/* For disassembly only; pattern left blank */]>,
-           T1Disassembly<0b10, {?,?,?,?,?,?,?,?}> {
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+                []>,
+           T1Encoding<0b101111> {
+  let Inst{9-8} = 0b10;
   // A8.6.22
   bits<8> val;
   let Inst{7-0} = val;
 }
 
-def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe",
-                    [/* For disassembly only; pattern left blank */]>,
-                T1Encoding<0b101101> {
-  // A8.6.156
-  let Inst{9-5} = 0b10010;
-  let Inst{4}   = 1;
-  let Inst{3}   = 1;            // Big-Endian
-  let Inst{2-0} = 0b000;
-}
-
-def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle",
-                    [/* For disassembly only; pattern left blank */]>,
-                T1Encoding<0b101101> {
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+                  []>, T1Encoding<0b101101> {
+  bits<1> end;
   // A8.6.156
   let Inst{9-5} = 0b10010;
   let Inst{4}   = 1;
-  let Inst{3}   = 0;            // Little-Endian
+  let Inst{3}   = end;
   let Inst{2-0} = 0b000;
 }
 
 // Change Processor State is a system instruction -- for disassembly only.
 def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
-                NoItinerary, "cps$imod $iflags",
-                [/* For disassembly only; pattern left blank */]>,
+                NoItinerary, "cps$imod $iflags", []>,
            T1Misc<0b0110011> {
   // A8.6.38 & B6.1.1
   bit imod;
@@ -277,6 +295,7 @@ def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
   let Inst{4}   = imod;
   let Inst{3}   = 0;
   let Inst{2-0} = iflags;
+  let DecoderMethod = "DecodeThumbCPS";
 }
 
 // For both thumb1 and thumb2.
@@ -290,70 +309,70 @@ def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
   let Inst{2-0} = dst;
 }
 
-// PC relative add (ADR).
-def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi,
-                   "add\t$dst, pc, $rhs", []>,
-               T1Encoding<{1,0,1,0,0,?}> {
-  // A6.2 & A8.6.10
-  bits<3> dst;
-  bits<8> rhs;
-  let Inst{10-8} = dst;
-  let Inst{7-0}  = rhs;
-}
-
 // ADD <Rd>, sp, #<imm8>
-// This is rematerializable, which is particularly useful for taking the
-// address of locals.
-let isReMaterializable = 1 in
-def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi,
-                   "add\t$dst, $sp, $rhs", []>,
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+                    IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
                T1Encoding<{1,0,1,0,1,?}> {
   // A6.2 & A8.6.8
   bits<3> dst;
-  bits<8> rhs;
+  bits<8> imm;
   let Inst{10-8} = dst;
-  let Inst{7-0}  = rhs;
+  let Inst{7-0}  = imm;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
 }
 
 // ADD sp, sp, #<imm7>
-def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
-                  "add\t$dst, $rhs", []>,
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                     IIC_iALUi, "add", "\t$Rdn, $imm", []>,
               T1Misc<{0,0,0,0,0,?,?}> {
   // A6.2.5 & A8.6.8
-  bits<7> rhs;
-  let Inst{6-0} = rhs;
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
 }
 
 // SUB sp, sp, #<imm7>
 // FIXME: The encoding and the ASM string don't match up.
-def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
-                  "sub\t$dst, $rhs", []>,
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                    IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
               T1Misc<{0,0,0,0,1,?,?}> {
   // A6.2.5 & A8.6.214
-  bits<7> rhs;
-  let Inst{6-0} = rhs;
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
 }
 
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
+
 // ADD <Rm>, sp
-def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
-                  "add\t$dst, $rhs", []>,
+def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr,
+                  "add", "\t$Rdn, $sp, $Rn", []>,
               T1Special<{0,0,?,?}> {
   // A8.6.9 Encoding T1
-  bits<4> dst;
-  let Inst{7}   = dst{3};
+  bits<4> Rdn;
+  let Inst{7}   = Rdn{3};
   let Inst{6-3} = 0b1101;
-  let Inst{2-0} = dst{2-0};
+  let Inst{2-0} = Rdn{2-0};
+  let DecoderMethod = "DecodeThumbAddSPReg";
 }
 
 // ADD sp, <Rm>
-def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
-                  "add\t$dst, $rhs", []>,
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+                  "add", "\t$Rdn, $Rm", []>,
               T1Special<{0,0,?,?}> {
   // A8.6.9 Encoding T2
-  bits<4> dst;
+  bits<4> Rm;
   let Inst{7} = 1;
-  let Inst{6-3} = dst;
+  let Inst{6-3} = Rm;
   let Inst{2-0} = 0b101;
+  let DecoderMethod = "DecodeThumbAddSPReg";
 }
 
 //===----------------------------------------------------------------------===//
@@ -390,11 +409,12 @@ let isCall = 1,
   Uses = [SP] in {
   // Also used for Thumb2
   def tBL  : TIx2<0b11110, 0b11, 1,
-                  (outs), (ins t_bltarget:$func, variable_ops), IIC_Br,
-                  "bl\t$func",
+                  (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br,
+                  "bl${p}\t$func",
                   [(ARMtcall tglobaladdr:$func)]>,
              Requires<[IsThumb, IsNotDarwin]> {
-    bits<21> func;
+    bits<22> func;
+    let Inst{26} = func{21};
     let Inst{25-16} = func{20-11};
     let Inst{13} = 1;
     let Inst{11} = 1;
@@ -403,8 +423,8 @@ let isCall = 1,
 
   // ARMv5T and above, also used for Thumb2
   def tBLXi : TIx2<0b11110, 0b11, 0,
-                   (outs), (ins t_blxtarget:$func, variable_ops), IIC_Br,
-                   "blx\t$func",
+                 (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br,
+                   "blx${p}\t$func",
                    [(ARMcall tglobaladdr:$func)]>,
               Requires<[IsThumb, HasV5T, IsNotDarwin]> {
     bits<21> func;
@@ -416,8 +436,8 @@ let isCall = 1,
   }
 
   // Also used for Thumb2
-  def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
-                  "blx\t$func",
+  def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
+                  "blx${p}\t$func",
                   [(ARMtcall GPR:$func)]>,
               Requires<[IsThumb, HasV5T, IsNotDarwin]>,
               T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
@@ -440,43 +460,22 @@ let isCall = 1,
   Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [R7, SP] in {
   // Also used for Thumb2
-  def tBLr9 : TIx2<0b11110, 0b11, 1,
-                   (outs), (ins pred:$p, t_bltarget:$func, variable_ops),
-                   IIC_Br, "bl${p}\t$func",
-                   [(ARMtcall tglobaladdr:$func)]>,
-              Requires<[IsThumb, IsDarwin]> {
-    bits<21> func;
-    let Inst{25-16} = func{20-11};
-    let Inst{13} = 1;
-    let Inst{11} = 1;
-    let Inst{10-0} = func{10-0};
-  }
+  def tBLr9 : tPseudoExpand<(outs), (ins pred:$p, t_bltarget:$func, variable_ops),
+                          4, IIC_Br, [(ARMtcall tglobaladdr:$func)],
+                          (tBL pred:$p, t_bltarget:$func)>,
+              Requires<[IsThumb, IsDarwin]>;
 
   // ARMv5T and above, also used for Thumb2
-  def tBLXi_r9 : TIx2<0b11110, 0b11, 0,
-                      (outs), (ins pred:$p, t_blxtarget:$func, variable_ops),
-                      IIC_Br, "blx${p}\t$func",
-                      [(ARMcall tglobaladdr:$func)]>,
-                 Requires<[IsThumb, HasV5T, IsDarwin]> {
-    bits<21> func;
-    let Inst{25-16} = func{20-11};
-    let Inst{13} = 1;
-    let Inst{11} = 1;
-    let Inst{10-1} = func{10-1};
-    let Inst{0} = 0; // func{0} is assumed zero
-  }
+  def tBLXi_r9 : tPseudoExpand<(outs), (ins pred:$p, t_blxtarget:$func, variable_ops),
+                      4, IIC_Br, [(ARMcall tglobaladdr:$func)],
+                      (tBLXi pred:$p, t_blxtarget:$func)>,
+                 Requires<[IsThumb, HasV5T, IsDarwin]>;
 
   // Also used for Thumb2
-  def tBLXr_r9 : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
-                    "blx${p}\t$func",
-                    [(ARMtcall GPR:$func)]>,
-                 Requires<[IsThumb, HasV5T, IsDarwin]>,
-                 T1Special<{1,1,1,?}> {
-    // A6.2.3 & A8.6.24
-    bits<4> func;
-    let Inst{6-3} = func;
-    let Inst{2-0} = 0b000;
-  }
+  def tBLXr_r9 : tPseudoExpand<(outs), (ins pred:$p, GPR:$func, variable_ops),
+                    2, IIC_Br, [(ARMtcall GPR:$func)],
+                    (tBLXr pred:$p, GPR:$func)>,
+                 Requires<[IsThumb, HasV5T, IsDarwin]>;
 
   // ARMv4T
   def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
@@ -487,8 +486,8 @@ let isCall = 1,
 
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
   let isPredicable = 1 in
-  def tB   : T1I<(outs), (ins t_brtarget:$target), IIC_Br,
-                 "b\t$target", [(br bb:$target)]>,
+  def tB   : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+                 "b", "\t$target", [(br bb:$target)]>,
              T1Encoding<{1,1,1,0,0,?}> {
     bits<11> target;
     let Inst{10-0} = target;
@@ -498,8 +497,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
   // Just a pseudo for a tBL instruction. Needed to let regalloc know about
   // the clobber of LR.
   let Defs = [LR] in
-  def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target),
-                          4, IIC_Br, [], (tBL t_bltarget:$target)>;
+  def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
+                          4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>;
 
   def tBR_JTr : tPseudoInst<(outs),
                       (ins tGPR:$target, i32imm:$jt, i32imm:$id),
@@ -522,31 +521,6 @@ let isBranch = 1, isTerminator = 1 in
   let Inst{7-0} = target;
 }
 
-// Compare and branch on zero / non-zero
-let isBranch = 1, isTerminator = 1 in {
-  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
-                  "cbz\t$Rn, $target", []>,
-              T1Misc<{0,0,?,1,?,?,?}> {
-    // A8.6.27
-    bits<6> target;
-    bits<3> Rn;
-    let Inst{9}   = target{5};
-    let Inst{7-3} = target{4-0};
-    let Inst{2-0} = Rn;
-  }
-
-  def tCBNZ : T1I<(outs), (ins tGPR:$cmp, t_cbtarget:$target), IIC_Br,
-                  "cbnz\t$cmp, $target", []>,
-              T1Misc<{1,0,?,1,?,?,?}> {
-    // A8.6.27
-    bits<6> target;
-    bits<3> Rn;
-    let Inst{9}   = target{5};
-    let Inst{7-3} = target{4-0};
-    let Inst{2-0} = Rn;
-  }
-}
-
 // Tail calls
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Darwin versions.
@@ -562,9 +536,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Non-Darwin versions (the difference is R9).
   let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in {
-    def tTAILJMPdND : tPseudoExpand<(outs), (ins t_brtarget:$dst, variable_ops),
+    def tTAILJMPdND : tPseudoExpand<(outs),
+                   (ins t_brtarget:$dst, pred:$p, variable_ops),
                    4, IIC_Br, [],
-                   (tB t_brtarget:$dst)>,
+                   (tB t_brtarget:$dst, pred:$p)>,
                  Requires<[IsThumb, IsNotDarwin]>;
     def tTAILJMPrND : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
                      4, IIC_Br, [],
@@ -574,11 +549,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 }
 
 
-// A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only
+// A8.6.218 Supervisor Call (Software Interrupt)
 // A8.6.16 B: Encoding T1
 // If Inst{11-8} == 0b1111 then SEE SVC
 let isCall = 1, Uses = [SP] in
-def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br,
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
                 "svc", "\t$imm", []>, Encoding16 {
   bits<8> imm;
   let Inst{15-12} = 0b1101;
@@ -653,17 +628,17 @@ defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
 
 let AddedComplexity = 10 in
 def tLDRSB :                    // A8.6.80
-  T1pILdStEncode<0b011, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+  T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
                  AddrModeT1_1, IIC_iLoad_bh_r,
-                 "ldrsb", "\t$dst, $addr",
-                 [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
+                 "ldrsb", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
 
 let AddedComplexity = 10 in
 def tLDRSH :                    // A8.6.84
-  T1pILdStEncode<0b111, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+  T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
                  AddrModeT1_2, IIC_iLoad_bh_r,
-                 "ldrsh", "\t$dst, $addr",
-                 [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
+                 "ldrsh", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
 
 let canFoldAsLoad = 1 in
 def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
@@ -678,7 +653,7 @@ def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
 
 // Load tconstpool
 // FIXME: Use ldr.n to work around a Darwin assembler bug.
-let canFoldAsLoad = 1, isReMaterializable = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
 def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
                   "ldr", ".n\t$Rt, $addr",
                   [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
@@ -736,42 +711,53 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
 //  Load / store multiple Instructions.
 //
 
-multiclass thumb_ldst_mult<string asm, InstrItinClass itin,
-                           InstrItinClass itin_upd, bits<6> T1Enc,
-                           bit L_bit> {
-  def IA :
-    T1I<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-        itin, !strconcat(asm, "ia${p}\t$Rn, $regs"), []>,
-       T1Encoding<T1Enc> {
-    bits<3> Rn;
-    bits<8> regs;
-    let Inst{10-8} = Rn;
-    let Inst{7-0}  = regs;
-  }
-  def IA_UPD :
-    T1It<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-         itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []>,
-        T1Encoding<T1Enc> {
-    bits<3> Rn;
-    bits<8> regs;
-    let Inst{10-8} = Rn;
-    let Inst{7-0}  = regs;
-  }
-}
-
 // These require base address to be written back or one of the loaded regs.
 let neverHasSideEffects = 1 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
-defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu,
-                            {1,1,0,0,1,?}, 1>;
-
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+        IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
+def tLDMIA_UPD :
+    InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "$Rn = $wb", IIC_iLoad_mu>,
+    PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+  let Size = 2;
+  let OutOperandList = (outs GPR:$wb);
+  let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+  let Pattern = [];
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu,
-                            {1,1,0,0,0,?}, 0>;
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+                         (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+                         AddrModeNone, 2, IIC_iStore_mu,
+                         "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+                     T1Encoding<{1,1,0,0,0,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
 
 } // neverHasSideEffects
 
+def : InstAlias<"ldm${p} $Rn!, $regs",
+                (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+        Requires<[IsThumb, IsThumb1Only]>;
+
 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
 def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
                IIC_iPop,
@@ -876,7 +862,7 @@ def tADC :                      // A8.6.2
 
 // Add immediate
 def tADDi3 :                    // A8.6.4 T1
-  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
                    IIC_iALUi,
                    "add", "\t$Rd, $Rm, $imm3",
                    [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
@@ -885,8 +871,8 @@ def tADDi3 :                    // A8.6.4 T1
 }
 
 def tADDi8 :                    // A8.6.4 T2
-  T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
-                    IIC_iALUi,
+  T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
                     "add", "\t$Rdn, $imm8",
                     [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
 
@@ -920,10 +906,10 @@ def tAND :                      // A8.6.12
 
 // ASR immediate
 def tASRri :                    // A8.6.14
-  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
                    IIC_iMOVsi,
                    "asr", "\t$Rd, $Rm, $imm5",
-                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm:$imm5)))]> {
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
   bits<5> imm5;
   let Inst{10-6} = imm5;
 }
@@ -962,7 +948,7 @@ def tCMNz :                     // A8.6.33
 
 // CMP immediate
 let isCompare = 1, Defs = [CPSR] in {
-def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, i32imm:$imm8), IIC_iCMPi,
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
                   "cmp", "\t$Rn, $imm8",
                   [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
              T1General<{1,0,1,?,?}> {
@@ -1003,7 +989,7 @@ def tEOR :                      // A8.6.45
 
 // LSL immediate
 def tLSLri :                    // A8.6.88
-  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
                    IIC_iMOVsi,
                    "lsl", "\t$Rd, $Rm, $imm5",
                    [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
@@ -1020,10 +1006,10 @@ def tLSLrr :                    // A8.6.89
 
 // LSR immediate
 def tLSRri :                    // A8.6.90
-  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
                    IIC_iMOVsi,
                    "lsr", "\t$Rd, $Rm, $imm5",
-                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm:$imm5)))]> {
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
   bits<5> imm5;
   let Inst{10-6} = imm5;
 }
@@ -1047,6 +1033,10 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
   let Inst{10-8} = Rd;
   let Inst{7-0}  = imm8;
 }
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
 
 // A7-73: MOV(2) - mov setting flag.
 
@@ -1077,10 +1067,19 @@ def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
 // Multiply register
 let isCommutable = 1 in
 def tMUL :                      // A8.6.105 T1
-  T1sItDPEncode<0b1101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
-                IIC_iMUL32,
-                "mul", "\t$Rdn, $Rm, $Rdn",
-                [(set tGPR:$Rdn, (mul tGPR:$Rn, tGPR:$Rm))]>;
+  Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+           IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+           [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+      T1DataProcessing<0b1101> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+  let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+                                               pred:$p)>;
 
 // Move inverse register
 def tMVN :                      // A8.6.107
@@ -1132,6 +1131,9 @@ def tRSB :                      // A8.6.141
                "rsb", "\t$Rd, $Rn, #0",
                [(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
 
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+                 (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
 // Subtract with carry register
 let Uses = [CPSR] in
 def tSBC :                      // A8.6.151
@@ -1142,7 +1144,7 @@ def tSBC :                      // A8.6.151
 
 // Subtract immediate
 def tSUBi3 :                    // A8.6.210 T1
-  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
                    IIC_iALUi,
                    "sub", "\t$Rd, $Rm, $imm3",
                    [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
@@ -1151,8 +1153,8 @@ def tSUBi3 :                    // A8.6.210 T1
 }
 
 def tSUBi8 :                    // A8.6.210 T2
-  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
-                    IIC_iALUi,
+  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
                     "sub", "\t$Rdn, $imm8",
                     [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
 
@@ -1163,8 +1165,6 @@ def tSUBrr :                    // A8.6.212
                 "sub", "\t$Rd, $Rn, $Rm",
                 [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
 
-// TODO: A7-96: STMIA - store multiple.
-
 // Sign-extend byte
 def tSXTB :                     // A8.6.222
   T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1216,12 +1216,13 @@ let usesCustomInserter = 1 in  // Expanded after instruction selection.
 // assembler.
 
 def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
-               IIC_iALUi, "adr{$p}\t$Rd, #$addr", []>,
+               IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
                T1Encoding<{1,0,1,0,0,?}> {
   bits<3> Rd;
   bits<8> addr;
   let Inst{10-8} = Rd;
   let Inst{7-0} = addr;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
 }
 
 let neverHasSideEffects = 1, isReMaterializable = 1 in
@@ -1361,6 +1362,31 @@ def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
 def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
             (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
 
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+             (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src),
+             (tLDRBr t_addrmode_rrs1:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+             (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src),
+             (tLDRHr t_addrmode_rrs2:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+             (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src),
+             (tLDRr t_addrmode_rrs4:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+             (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val),
+             (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+             (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val),
+             (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+             (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val),
+             (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>;
+
 // Large immediate handling.
 
 // Two piece imms.
@@ -1395,3 +1421,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
                   2, IIC_Br, [(brind GPR:$Rm)],
                   (tMOVr PC, GPR:$Rm, pred:$p)>;
 }
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extention and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index c2c6cbcac0f5c..471ec29674877 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -12,13 +12,32 @@
 //===----------------------------------------------------------------------===//
 
 // IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+  let Name = "ITCondCode";
+  let ParserMethod = "parseITCondCode";
+}
 def it_pred : Operand<i32> {
   let PrintMethod = "printMandatoryPredicateOperand";
+  let ParserMatchClass = it_pred_asmoperand;
 }
 
 // IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
 def it_mask : Operand<i32> {
   let PrintMethod = "printThumbITMask";
+  let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 not allowed
+def t2_shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+  let DecoderMethod = "DecodeT2ShifterImmOperand";
 }
 
 // Shifted operands. No register controlled shifts for Thumb2.
@@ -28,6 +47,8 @@ def t2_so_reg : Operand<i32>,    // reg imm
                                [shl,srl,sra,rotr]> {
   let EncoderMethod = "getT2SORegOpValue";
   let PrintMethod = "printT2SOOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
   let MIOperandInfo = (ops rGPR, i32imm);
 }
 
@@ -50,6 +71,7 @@ def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
   }]> {
   let ParserMatchClass = t2_so_imm_asmoperand;
   let EncoderMethod = "getT2SOImmOpValue";
+  let DecoderMethod = "DecodeT2SOImm";
 }
 
 // t2_so_imm_not - Match an immediate that is a complement
@@ -65,11 +87,6 @@ def t2_so_imm_neg : Operand<i32>,
   return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1;
 }], t2_so_imm_neg_XFORM>;
 
-/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
-def imm1_31 : ImmLeaf<i32, [{
-  return (int32_t)Imm >= 1 && (int32_t)Imm < 32;
-}]>;
-
 /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
 def imm0_4095 : Operand<i32>,
                 ImmLeaf<i32, [{
@@ -96,17 +113,20 @@ def lo5AllOne : PatLeaf<(i32 imm), [{
 // Define Thumb2 specific addressing modes.
 
 // t2addrmode_imm12  := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
 def t2addrmode_imm12 : Operand<i32>,
                        ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
   let PrintMethod = "printAddrModeImm12Operand";
   let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm12";
+  let ParserMatchClass = t2addrmode_imm12_asmoperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 // t2ldrlabel  := imm12
 def t2ldrlabel : Operand<i32> {
   let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printT2LdrLabelOperand";
 }
 
 
@@ -116,13 +136,36 @@ def t2adrlabel : Operand<i32> {
 }
 
 
+// t2addrmode_posimm8  := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8  := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
 // t2addrmode_imm8  := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
 def t2addrmode_imm8 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
   let PrintMethod = "printT2AddrModeImm8Operand";
   let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemImm8OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 def t2am_imm8_offset : Operand<i32>,
@@ -130,38 +173,61 @@ def t2am_imm8_offset : Operand<i32>,
                                       [], [SDNPWantRoot]> {
   let PrintMethod = "printT2AddrModeImm8OffsetOperand";
   let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
-  let ParserMatchClass = MemMode5AsmOperand;
+  let DecoderMethod = "DecodeT2Imm8";
 }
 
 // t2addrmode_imm8s4  := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
 def t2addrmode_imm8s4 : Operand<i32> {
   let PrintMethod = "printT2AddrModeImm8s4Operand";
   let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8s4";
+  let ParserMatchClass = MemImm8s4OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
-  let ParserMatchClass = MemMode5AsmOperand;
 }
 
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
 def t2am_imm8s4_offset : Operand<i32> {
   let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+  let EncoderMethod = "getT2Imm8s4OpValue";
+  let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4  := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+  let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+  let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+  let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
 }
 
 // t2addrmode_so_reg  := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";}
 def t2addrmode_so_reg : Operand<i32>,
                         ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
   let PrintMethod = "printT2AddrModeSoRegOperand";
   let EncoderMethod = "getT2AddrModeSORegOpValue";
+  let DecoderMethod = "DecodeT2AddrModeSOReg";
+  let ParserMatchClass = t2addrmode_so_reg_asmoperand;
   let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
-  let ParserMatchClass = MemMode5AsmOperand;
 }
 
-// t2addrmode_reg := reg
-// Used by load/store exclusive instructions. Useful to enable right assembly
-// parsing and printing. Not used for any codegen matching.
-//
-def t2addrmode_reg : Operand<i32> {
-  let PrintMethod = "printAddrMode7Operand";
-  let MIOperandInfo = (ops GPR);
-  let ParserMatchClass = MemMode7AsmOperand;
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; }
+def addrmode_tbb : Operand<i32> {
+  let PrintMethod = "printAddrModeTBB";
+  let ParserMatchClass = addrmode_tbb_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; }
+def addrmode_tbh : Operand<i32> {
+  let PrintMethod = "printAddrModeTBH";
+  let ParserMatchClass = addrmode_tbh_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
 }
 
 //===----------------------------------------------------------------------===//
@@ -419,47 +485,6 @@ class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
 }
 
 
-/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
-/// unary operation that produces a value. These are predicable and can be
-/// changed to modify CPSR.
-multiclass T2I_un_irs<bits<4> opcod, string opc,
-                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
-                      PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
-   // shifted imm
-   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
-                opc, "\t$Rd, $imm",
-                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
-     let isAsCheapAsAMove = Cheap;
-     let isReMaterializable = ReMat;
-     let Inst{31-27} = 0b11110;
-     let Inst{25} = 0;
-     let Inst{24-21} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15} = 0;
-   }
-   // register
-   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
-                opc, ".w\t$Rd, $Rm",
-                [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{14-12} = 0b000; // imm3
-     let Inst{7-6} = 0b00; // imm2
-     let Inst{5-4} = 0b00; // type
-   }
-   // shifted register
-   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
-                opc, ".w\t$Rd, $ShiftedRm",
-                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-   }
-}
-
 /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
 /// binary operation that produces a value. These are predicable and can be
 /// changed to modify CPSR.
@@ -500,21 +525,18 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    }
   // Assembly aliases for optional destination operand when it's the same
   // as the source operand.
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+  def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
      (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
                                                     t2_so_imm:$imm, pred:$p,
-                                                    cc_out:$s)>,
-     Requires<[IsThumb2]>;
-  def : InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
      (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
                                                     rGPR:$Rm, pred:$p,
-                                                    cc_out:$s)>,
-     Requires<[IsThumb2]>;
-  def : InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
      (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
                                                     t2_so_reg:$shift, pred:$p,
-                                                    cc_out:$s)>,
-     Requires<[IsThumb2]>;
+                                                    cc_out:$s)>;
 }
 
 /// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
@@ -522,7 +544,27 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
 multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
                          PatFrag opnode, string baseOpc, bit Commutable = 0> :
-    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w">;
+    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w"> {
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rd, rGPR:$Rn,
+                                                    t2_so_reg:$shift, pred:$p,
+                                                    cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_reg:$shift, pred:$p,
+                                                    cc_out:$s)>;
+}
 
 /// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
 /// reversed.  The 'rr' form is only defined for the disassembler; for codegen
@@ -563,45 +605,28 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
 
 /// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
 /// instruction modifies the CPSR register.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
 multiclass T2I_bin_s_irs<bits<4> opcod, string opc,
                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
                          PatFrag opnode, bit Commutable = 0> {
    // shifted imm
-   def ri : T2TwoRegImm<
+   def ri : T2sTwoRegImm<
                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), iii,
-                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
-                [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
-     let Inst{31-27} = 0b11110;
-     let Inst{25} = 0;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-     let Inst{15} = 0;
-   }
+                opc, ".w\t$Rd, $Rn, $imm",
+                [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_imm:$imm))]>;
    // register
-   def rr : T2ThreeReg<
+   def rr : T2sThreeReg<
                 (outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), iir,
-                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $Rm",
-                [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
-     let isCommutable = Commutable;
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-     let Inst{14-12} = 0b000; // imm3
-     let Inst{7-6} = 0b00; // imm2
-     let Inst{5-4} = 0b00; // type
-   }
+                opc, ".w\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, rGPR:$Rm))]>;
    // shifted register
-   def rs : T2TwoRegShiftedReg<
+   def rs : T2sTwoRegShiftedReg<
                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
-                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $ShiftedRm",
-                [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-   }
+                opc, ".w\t$Rd, $Rn, $ShiftedRm",
+               [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]>;
 }
 }
 
@@ -614,9 +639,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    // in particular for taking the address of a local.
    let isReMaterializable = 1 in {
    def ri : T2sTwoRegImm<
-                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
-                 opc, ".w\t$Rd, $Rn, $imm",
-                 [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
+               (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+               opc, ".w\t$Rd, $Rn, $imm",
+               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24} = 1;
@@ -626,9 +651,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    }
    // 12-bit imm
    def ri12 : T2I<
-                  (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
                   !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
-                  [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
      bits<4> Rd;
      bits<4> Rn;
      bits<12> imm;
@@ -644,9 +669,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
      let Inst{7-0} = imm{7-0};
    }
    // register
-   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), IIC_iALUr,
-                 opc, ".w\t$Rd, $Rn, $Rm",
-                 [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
+   def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+                 IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -658,9 +683,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    }
    // shifted register
    def rs : T2sTwoRegShiftedReg<
-                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm),
+                 (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
                  IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
-                 [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
+              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24} = 1;
@@ -671,13 +696,13 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
 /// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
 /// for a binary operation that produces a value and use the carry
 /// bit. It's not predicable.
-let Uses = [CPSR] in {
+let Defs = [CPSR], Uses = [CPSR] in {
 multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                              bit Commutable = 0> {
    // shifted imm
    def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
                  IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+               [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
@@ -687,7 +712,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
    // register
    def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
                  opc, ".w\t$Rd, $Rn, $Rm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
                  Requires<[IsThumb2]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
@@ -701,7 +726,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
    def rs : T2sTwoRegShiftedReg<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
                  IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+         [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -710,64 +735,35 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
 }
 }
 
-// Carry setting variants
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
-   // shifted imm
-   def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
-                4, IIC_iALUi,
-                [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>;
-   // register
-   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
-                4, IIC_iALUr,
-                [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
-     let isCommutable = Commutable;
-   }
-   // shifted register
-   def rs : t2PseudoInst<
-                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
-                4, IIC_iALUsi,
-                [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>;
-}
-}
-
 /// T2I_rbin_s_is - Same as T2I_rbin_irs except sets 's' bit and the register
 /// version is not needed since this is only for codegen.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
 multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
    // shifted imm
-   def ri : T2TwoRegImm<
+   def ri : T2sTwoRegImm<
                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
-                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
-                [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
-     let Inst{31-27} = 0b11110;
-     let Inst{25} = 0;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-     let Inst{15} = 0;
-   }
+                opc, ".w\t$Rd, $Rn, $imm",
+                [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, rGPR:$Rn))]>;
    // shifted register
-   def rs : T2TwoRegShiftedReg<
+   def rs : T2sTwoRegShiftedReg<
                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
-                IIC_iALUsi, !strconcat(opc, "s"), "\t$Rd, $Rn, $ShiftedRm",
-                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-   }
+                IIC_iALUsi, opc, "\t$Rd, $Rn, $ShiftedRm",
+              [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>;
 }
 }
 
 /// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
 //  rotate operation that produces a value.
-multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode,
+                     string baseOpc> {
    // 5-bit imm
    def ri : T2sTwoRegShiftImm<
-                 (outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$imm), IIC_iMOVsi,
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
                  opc, ".w\t$Rd, $Rm, $imm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rm, imm1_31:$imm))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-21} = 0b010010;
      let Inst{19-16} = 0b1111; // Rn
@@ -784,20 +780,50 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
      let Inst{15-12} = 0b1111;
      let Inst{7-4} = 0b0000;
    }
+
+  // Optional destination register
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    ty:$imm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rd, rGPR:$Rn,
+                                                    ty:$imm, pred:$p,
+                                                   cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    ty:$imm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
 }
 
 /// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
 /// patterns. Similar to T2I_bin_irs except the instruction does not produce
 /// a explicit result, only implicitly set CPSR.
-let isCompare = 1, Defs = [CPSR] in {
 multiclass T2I_cmp_irs<bits<4> opcod, string opc,
                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
-                       PatFrag opnode> {
+                       PatFrag opnode, string baseOpc> {
+let isCompare = 1, Defs = [CPSR] in {
    // shifted imm
    def ri : T2OneRegCmpImm<
-                (outs), (ins GPR:$Rn, t2_so_imm:$imm), iii,
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
                 opc, ".w\t$Rn, $imm",
-                [(opnode GPR:$Rn, t2_so_imm:$imm)]> {
+                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -807,9 +833,9 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
    }
    // register
    def rr : T2TwoRegCmp<
-                (outs), (ins GPR:$lhs, rGPR:$rhs), iir,
-                opc, ".w\t$lhs, $rhs",
-                [(opnode GPR:$lhs, rGPR:$rhs)]> {
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+                opc, ".w\t$Rn, $Rm",
+                [(opnode GPRnopc:$Rn, rGPR:$Rm)]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -821,9 +847,9 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
    }
    // shifted register
    def rs : T2OneRegCmpShiftedReg<
-                (outs), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
                 opc, ".w\t$Rn, $ShiftedRm",
-                [(opnode GPR:$Rn, t2_so_reg:$ShiftedRm)]> {
+                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -831,55 +857,60 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
      let Inst{11-8} = 0b1111; // Rd
    }
 }
+
+  // Assembler aliases w/o the ".w" suffix.
+  // No alias here for 'rr' version as not all instantiations of this
+  // multiclass want one (CMP in particular, does not).
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
+     (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPRnopc:$Rn,
+                                                    t2_so_imm:$imm, pred:$p)>;
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+     (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPRnopc:$Rn,
+                                                    t2_so_reg:$shift,
+                                                    pred:$p)>;
 }
 
 /// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
 multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
-                  InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
-  def i12 : T2Ii12<(outs GPR:$Rt), (ins t2addrmode_imm12:$addr), iii,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(set GPR:$Rt, (opnode t2addrmode_imm12:$addr))]> {
-    let Inst{31-27} = 0b11111;
-    let Inst{26-25} = 0b00;
+                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-25} = 0b1111100;
     let Inst{24} = signed;
     let Inst{23} = 1;
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
-
-    bits<4> Rt;
-    let Inst{15-12} = Rt;
-
-    bits<17> addr;
-    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
-    let Inst{23}    = addr{12};    // U
+    let Inst{15-12} = Rt;
     let Inst{11-0}  = addr{11-0};  // imm
   }
-  def i8  : T2Ii8 <(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), iii,
+  def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
-                   [(set GPR:$Rt, (opnode t2addrmode_imm8:$addr))]> {
+                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+    bits<4> Rt;
+    bits<13> addr;
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
     let Inst{23} = 0;
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{15-12} = Rt;
     let Inst{11} = 1;
     // Offset: index==TRUE, wback==FALSE
     let Inst{10} = 1; // The P bit.
-    let Inst{8} = 0; // The W bit.
-
-    bits<4> Rt;
-    let Inst{15-12} = Rt;
-
-    bits<13> addr;
-    let Inst{19-16} = addr{12-9}; // Rn
     let Inst{9}     = addr{8};    // U
+    let Inst{8} = 0; // The W bit.
     let Inst{7-0}   = addr{7-0};  // imm
   }
-  def s   : T2Iso <(outs GPR:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+  def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
-                   [(set GPR:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
@@ -895,12 +926,14 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{19-16} = addr{9-6}; // Rn
     let Inst{3-0}   = addr{5-2}; // Rm
     let Inst{5-4}   = addr{1-0}; // imm
+
+    let DecoderMethod = "DecodeT2LoadShift";
   }
 
   // FIXME: Is the pci variant actually needed?
-  def pci : T2Ipc <(outs GPR:$Rt), (ins t2ldrlabel:$addr), iii,
+  def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
     let isReMaterializable = 1;
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
@@ -918,10 +951,11 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
 
 /// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
 multiclass T2I_st<bits<2> opcod, string opc,
-                  InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
-  def i12 : T2Ii12<(outs), (ins GPR:$Rt, t2addrmode_imm12:$addr), iii,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(opnode GPR:$Rt, t2addrmode_imm12:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0001;
     let Inst{22-21} = opcod;
@@ -936,9 +970,9 @@ multiclass T2I_st<bits<2> opcod, string opc,
     let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm
   }
-  def i8  : T2Ii8 <(outs), (ins GPR:$Rt, t2addrmode_imm8:$addr), iii,
+  def i8  : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
-                   [(opnode GPR:$Rt, t2addrmode_imm8:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
@@ -956,9 +990,9 @@ multiclass T2I_st<bits<2> opcod, string opc,
     let Inst{9}     = addr{8};    // U
     let Inst{7-0}   = addr{7-0};  // imm
   }
-  def s   : T2Iso <(outs), (ins GPR:$Rt, t2addrmode_so_reg:$addr), iis,
+  def s   : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
-                   [(opnode GPR:$Rt, t2addrmode_so_reg:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
@@ -977,146 +1011,81 @@ multiclass T2I_st<bits<2> opcod, string opc,
 
 /// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode> {
-  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
-                  opc, ".w\t$Rd, $Rm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-     let Inst{5-4} = 0b00; // rotate
-   }
-  def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
-                  opc, ".w\t$Rd, $Rm, ror $rot",
-                 [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-
-     bits<2> rot;
-     let Inst{5-4} = rot{1-0}; // rotate
-   }
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, ".w\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+             Requires<[IsThumb2]> {
+   let Inst{31-27} = 0b11111;
+   let Inst{26-23} = 0b0100;
+   let Inst{22-20} = opcod;
+   let Inst{19-16} = 0b1111; // Rn
+   let Inst{15-12} = 0b1111;
+   let Inst{7} = 1;
+
+   bits<2> rot;
+   let Inst{5-4} = rot{1-0}; // rotate
 }
 
 // UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
-multiclass T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
-  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
-                  opc, "\t$Rd, $Rm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rm))]>,
-                 Requires<[HasT2ExtractPack, IsThumb2]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-     let Inst{5-4} = 0b00; // rotate
-   }
-  def r_rot : T2TwoReg<(outs rGPR:$dst), (ins rGPR:$Rm, rot_imm:$rot),
-                  IIC_iEXTr, opc, "\t$dst, $Rm, ror $rot",
-                 [(set rGPR:$dst, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
-                 Requires<[HasT2ExtractPack, IsThumb2]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-
-     bits<2> rot;
-     let Inst{5-4} = rot{1-0}; // rotate
-   }
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+            [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+          Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
 }
 
 // SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
 // supported yet.
-multiclass T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> {
-  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
-                  opc, "\t$Rd, $Rm", []>,
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, "\t$Rd, $Rm$rot", []>,
           Requires<[IsThumb2, HasT2ExtractPack]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-     let Inst{5-4} = 0b00; // rotate
-   }
-  def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$rot), IIC_iEXTr,
-                  opc, "\t$Rd, $Rm, ror $rot", []>,
-          Requires<[IsThumb2, HasT2ExtractPack]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{19-16} = 0b1111; // Rn
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-
-      bits<2> rot;
-      let Inst{5-4} = rot{1-0}; // rotate
-   }
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
 }
 
 /// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode> {
-  def rr     : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
-                  opc, "\t$Rd, $Rn, $Rm",
-                  [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-     let Inst{5-4} = 0b00; // rotate
-   }
-  def rr_rot : T2ThreeReg<(outs rGPR:$Rd),
-                  (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
-                  IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
-                  [(set rGPR:$Rd, (opnode rGPR:$Rn,
-                                          (rotr rGPR:$Rm, rot_imm:$rot)))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-
-     bits<2> rot;
-     let Inst{5-4} = rot{1-0}; // rotate
-   }
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+             [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
+           Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
 }
 
-// DO variant - disassembly only, no pattern
-
-multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> {
-  def rr     : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
-                  opc, "\t$Rd, $Rn, $Rm", []> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-     let Inst{5-4} = 0b00; // rotate
-   }
-  def rr_rot :T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot),
-                  IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> {
-     let Inst{31-27} = 0b11111;
-     let Inst{26-23} = 0b0100;
-     let Inst{22-20} = opcod;
-     let Inst{15-12} = 0b1111;
-     let Inst{7} = 1;
-
-     bits<2> rot;
-     let Inst{5-4} = rot{1-0}; // rotate
-   }
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1143,7 +1112,7 @@ class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
 // assembler.
 def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
               (ins t2adrlabel:$addr, pred:$p),
-              IIC_iALUi, "adr{$p}.w\t$Rd, #$addr", []> {
+              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25-24} = 0b10;
   // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
@@ -1160,6 +1129,8 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
   let Inst{26}    = addr{11};
   let Inst{14-12} = addr{10-8};
   let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2Adr";
 }
 
 let neverHasSideEffects = 1, isReMaterializable = 1 in
@@ -1177,33 +1148,33 @@ def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
 
 // Load
 let canFoldAsLoad = 1, isReMaterializable = 1  in
-defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si,
+defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
                       UnOpFrag<(load node:$Src)>>;
 
 // Loads with zero extension
 defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      UnOpFrag<(zextloadi16 node:$Src)>>;
+                      rGPR, UnOpFrag<(zextloadi16 node:$Src)>>;
 defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      UnOpFrag<(zextloadi8  node:$Src)>>;
+                      rGPR, UnOpFrag<(zextloadi8  node:$Src)>>;
 
 // Loads with sign extension
 defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      UnOpFrag<(sextloadi16 node:$Src)>>;
+                      rGPR, UnOpFrag<(sextloadi16 node:$Src)>>;
 defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      UnOpFrag<(sextloadi8  node:$Src)>>;
+                      rGPR, UnOpFrag<(sextloadi8  node:$Src)>>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
 def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                         (ins t2addrmode_imm8s4:$addr),
-                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", []>;
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 // zextload i1 -> zextload i8
 def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
             (t2LDRBi12  t2addrmode_imm12:$addr)>;
-def : T2Pat<(zextloadi1 t2addrmode_imm8:$addr),
-            (t2LDRBi8   t2addrmode_imm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
 def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
             (t2LDRBs    t2addrmode_so_reg:$addr)>;
 def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
@@ -1214,8 +1185,8 @@ def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
 // earlier?
 def : T2Pat<(extloadi1  t2addrmode_imm12:$addr),
             (t2LDRBi12  t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi1  t2addrmode_imm8:$addr),
-            (t2LDRBi8   t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
 def : T2Pat<(extloadi1  t2addrmode_so_reg:$addr),
             (t2LDRBs    t2addrmode_so_reg:$addr)>;
 def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
@@ -1223,8 +1194,8 @@ def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
 
 def : T2Pat<(extloadi8  t2addrmode_imm12:$addr),
             (t2LDRBi12  t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi8  t2addrmode_imm8:$addr),
-            (t2LDRBi8   t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
 def : T2Pat<(extloadi8  t2addrmode_so_reg:$addr),
             (t2LDRBs    t2addrmode_so_reg:$addr)>;
 def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
@@ -1232,8 +1203,8 @@ def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
 
 def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
             (t2LDRHi12  t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi16 t2addrmode_imm8:$addr),
-            (t2LDRHi8   t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
 def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
             (t2LDRHs    t2addrmode_so_reg:$addr)>;
 def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
@@ -1247,83 +1218,86 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
 // Indexed loads
 
 let mayLoad = 1, neverHasSideEffects = 1 in {
-def t2LDR_PRE  : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
-                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn",
-                            []>;
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
 
-def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn),
-                            (ins GPR:$base, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
-                          "ldr", "\t$Rt, [$Rn], $addr", "$base = $Rn",
-                            []>;
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
-def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn",
-                            []>;
-def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
-                            (ins GPR:$base, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                         "ldrb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
-                            []>;
-
-def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn",
-                            []>;
-def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn),
-                            (ins GPR:$base, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                         "ldrh", "\t$Rt, [$Rn], $addr", "$base = $Rn",
-                            []>;
-
-def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn",
-                            []>;
-def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
-                            (ins GPR:$base, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                        "ldrsb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
-                            []>;
-
-def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn",
-                            []>;
-def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn),
-                            (ins GPR:$base, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                        "ldrsh", "\t$dst, [$Rn], $addr", "$base = $Rn",
-                            []>;
+                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 } // mayLoad = 1, neverHasSideEffects = 1
 
-// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110) and are
-// for disassembly only.
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
 // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
-  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
           "\t$Rt, $addr", []> {
+  bits<4> Rt;
+  bits<13> addr;
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
   let Inst{24} = signed;
   let Inst{23} = 0;
   let Inst{22-21} = type;
   let Inst{20} = 1; // load
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt;
   let Inst{11} = 1;
   let Inst{10-8} = 0b110; // PUW.
-
-  bits<4> Rt;
-  bits<13> addr;
-  let Inst{15-12} = Rt;
-  let Inst{19-16} = addr{12-9};
-  let Inst{7-0}   = addr{7-0};
+  let Inst{7-0} = addr{7-0};
 }
 
 def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
@@ -1333,67 +1307,97 @@ def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
 def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
 
 // Store
-defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si,
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
                    BinOpFrag<(store node:$LHS, node:$RHS)>>;
 defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
-                   BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+                   rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
 defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
-                   BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+                   rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
 
 // Store doubleword
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
                        (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr),
-               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>;
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
 
 // Indexed stores
-def t2STR_PRE  : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
-                         "str", "\t$Rt, [$Rn, $addr]!",
-                         "$Rn = $base_wb,@earlyclobber $base_wb",
-             [(set GPR:$base_wb,
-                   (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
-                          "str", "\t$Rt, [$Rn], $addr",
-                          "$Rn = $base_wb,@earlyclobber $base_wb",
-             [(set GPR:$base_wb,
-                  (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STRH_PRE  : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            "str", "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
+def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
-                        "strh", "\t$Rt, [$Rn, $addr]!",
-                        "$Rn = $base_wb,@earlyclobber $base_wb",
-        [(set GPR:$base_wb,
-              (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
-                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
-                         "strh", "\t$Rt, [$Rn], $addr",
-                         "$Rn = $base_wb,@earlyclobber $base_wb",
-       [(set GPR:$base_wb,
-             (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+                        "strh", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
 
-def t2STRB_PRE  : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
-                        "strb", "\t$Rt, [$Rn, $addr]!",
-                        "$Rn = $base_wb,@earlyclobber $base_wb",
-         [(set GPR:$base_wb,
-               (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+                        "strb", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
 
-def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+                          "str", "\t$Rt, $Rn$offset",
+                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+             [(set GPRnopc:$Rn_wb,
+                  (post_store rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
                             AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
-                         "strb", "\t$Rt, [$Rn], $addr",
-                         "$Rn = $base_wb,@earlyclobber $base_wb",
-        [(set GPR:$base_wb,
-              (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+                         "strh", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+       [(set GPRnopc:$Rn_wb,
+             (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strb", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+        [(set GPRnopc:$Rn_wb,
+              (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+}
+
 
 // STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
 // only.
@@ -1424,21 +1428,31 @@ def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 // ldrd / strd pre / post variants
 // For disassembly only.
 
-def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
-                 (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
-                 "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+  let AsmMatchConverter = "cvtT2LdrdPre";
+  let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
 
-def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
-                 (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
-                 "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>;
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+                 IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
 
-def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs),
-                 (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
-                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
+def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $wb", []> {
+  let AsmMatchConverter = "cvtT2StrdPre";
+  let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
 
-def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs),
-                 (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
-                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>;
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+                      t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
 
 // T2Ipl (Preload Data/Instruction) signals the memory system of possible future
 // data/instruction access.  These are for disassembly only.
@@ -1463,9 +1477,9 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     let Inst{11-0}  = addr{11-0};  // imm12
   }
 
-  def i8 : T2Ii8<(outs), (ins t2addrmode_imm8:$addr), IIC_Preload, opc,
+  def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
                 "\t$addr",
-               [(ARMPreload t2addrmode_imm8:$addr, (i32 write), (i32 instr))]> {
+            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
     let Inst{23} = 0; // U = 0
@@ -1496,6 +1510,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     let Inst{19-16} = addr{9-6}; // Rn
     let Inst{3-0}   = addr{5-2}; // Rm
     let Inst{5-4}   = addr{1-0}; // imm2
+
+    let DecoderMethod = "DecodeT2LoadShift";
   }
 }
 
@@ -1507,11 +1523,11 @@ defm t2PLI  : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
 //  Load / store multiple Instructions.
 //
 
-multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
                             InstrItinClass itin_upd, bit L_bit> {
   def IA :
     T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-         itin, !strconcat(asm, "ia${p}.w\t$Rn, $regs"), []> {
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
     bits<4>  Rn;
     bits<16> regs;
 
@@ -1522,11 +1538,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
     let Inst{21}    = 0;        // No writeback
     let Inst{20}    = L_bit;
     let Inst{19-16} = Rn;
-    let Inst{15-0}  = regs;
+    let Inst{15}    = 0;
+    let Inst{14-0}  = regs{14-0};
   }
   def IA_UPD :
     T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-          itin_upd, !strconcat(asm, "ia${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
     bits<4>  Rn;
     bits<16> regs;
 
@@ -1537,11 +1554,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
     let Inst{21}    = 1;        // Writeback
     let Inst{20}    = L_bit;
     let Inst{19-16} = Rn;
-    let Inst{15-0}  = regs;
+    let Inst{15}    = 0;
+    let Inst{14-0}  = regs{14-0};
   }
   def DB :
     T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-         itin, !strconcat(asm, "db${p}.w\t$Rn, $regs"), []> {
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
     bits<4>  Rn;
     bits<16> regs;
 
@@ -1552,11 +1570,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
     let Inst{21}    = 0;        // No writeback
     let Inst{20}    = L_bit;
     let Inst{19-16} = Rn;
-    let Inst{15-0}  = regs;
+    let Inst{15}    = 0;
+    let Inst{14-0}  = regs{14-0};
   }
   def DB_UPD :
     T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
-          itin_upd, !strconcat(asm, "db${p}.w\t$Rn, $regs"), "$Rn = $wb", []> {
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
     bits<4>  Rn;
     bits<16> regs;
 
@@ -1567,17 +1586,95 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
     let Inst{21}    = 1;        // Writeback
     let Inst{20}    = L_bit;
     let Inst{19-16} = Rn;
-    let Inst{15-0}  = regs;
+    let Inst{15}    = 0;
+    let Inst{14-0}  = regs{14-0};
   }
 }
 
 let neverHasSideEffects = 1 in {
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
-defm t2LDM : thumb2_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+}
+
 
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
 
 } // neverHasSideEffects
 
@@ -1587,7 +1684,7 @@ defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
 //
 
 let neverHasSideEffects = 1 in
-def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
                    "mov", ".w\t$Rd, $Rm", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
@@ -1596,6 +1693,10 @@ def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
   let Inst{14-12} = 0b000;
   let Inst{7-4} = 0b0000;
 }
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                 pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                               pred:$p, CPSR)>;
 
 // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
@@ -1610,12 +1711,20 @@ def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
   let Inst{15} = 0;
 }
 
-def : InstAlias<"mov${s}${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
-                                                 pred:$p, cc_out:$s)>,
-                Requires<[IsThumb2]>;
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                 pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                               pred:$p, zero_reg)>;
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi,
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
                    "movw", "\t$Rd, $imm",
                    [(set rGPR:$Rd, imm0_65535:$imm)]> {
   let Inst{31-27} = 0b11110;
@@ -1632,6 +1741,7 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi,
   let Inst{26}    = imm{11};
   let Inst{14-12} = imm{10-8};
   let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
 }
 
 def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
@@ -1639,7 +1749,7 @@ def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
 
 let Constraints = "$src = $Rd" in {
 def t2MOVTi16 : T2I<(outs rGPR:$Rd),
-                    (ins rGPR:$src, i32imm_hilo16:$imm), IIC_iMOVi,
+                    (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
                     "movt", "\t$Rd, $imm",
                     [(set rGPR:$Rd,
                           (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
@@ -1657,6 +1767,7 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
   let Inst{26}    = imm{11};
   let Inst{14-12} = imm{10-8};
   let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
 }
 
 def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
@@ -1671,28 +1782,26 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
 
 // Sign extenders
 
-defm t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
+def t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
                               UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm t2SXTH  : T2I_ext_rrot<0b000, "sxth",
+def t2SXTH  : T2I_ext_rrot<0b000, "sxth",
                               UnOpFrag<(sext_inreg node:$Src, i16)>>;
-defm t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
 
-defm t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
                         BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
                         BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-defm t2SXTAB16 : T2I_exta_rrot_DO<0b010, "sxtab16">;
-
-// TODO: SXT(A){B|H}16 - done for disassembly only
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
 
 // Zero extenders
 
 let AddedComplexity = 16 in {
-defm t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
+def t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
                                UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm t2UXTH   : T2I_ext_rrot<0b001, "uxth",
+def t2UXTH   : T2I_ext_rrot<0b001, "uxth",
                                UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
                                UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
 
 // FIXME: This pattern incorrectly assumes the shl operator is a rotate.
@@ -1700,17 +1809,17 @@ defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
 //        instead so we can include a check for masking back in the upper
 //        eight bits of the source into the lower eight bits of the result.
 //def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
-//            (t2UXTB16r_rot rGPR:$Src, 24)>,
+//            (t2UXTB16 rGPR:$Src, 3)>,
 //          Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
-            (t2UXTB16r_rot rGPR:$Src, 8)>,
+            (t2UXTB16 rGPR:$Src, 1)>,
         Requires<[HasT2ExtractPack, IsThumb2]>;
 
-defm t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
                            BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
                            BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-defm t2UXTAB16 : T2I_exta_rrot_DO<0b011, "uxtab16">;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1723,27 +1832,37 @@ defm t2SUB  : T2I_bin_ii12rs<0b101, "sub",
                              BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
 
 // ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
 defm t2ADDS : T2I_bin_s_irs <0b1000, "add",
                              IIC_iALUi, IIC_iALUr, IIC_iALUsi,
-                             BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
+                             BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
 defm t2SUBS : T2I_bin_s_irs <0b1101, "sub",
                              IIC_iALUi, IIC_iALUr, IIC_iALUsi,
-                             BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+                             BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
 
+let hasPostISelHook = 1 in {
 defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc",
-                          BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>;
+              BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
 defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc",
-                          BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>;
-defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS,
-                                                             node:$RHS)>, 1>;
-defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS,
-                                                             node:$RHS)>>;
+              BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+}
 
 // RSB
 defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb",
                              BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
 defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb",
-                             BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+                             BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
 
 // (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
 // The assume-no-carry-in form uses the negation of the input since add/sub
@@ -1760,23 +1879,18 @@ def : T2Pat<(add        GPR:$src, t2_so_imm_neg:$imm),
 def : T2Pat<(add        GPR:$src, imm0_4095_neg:$imm),
             (t2SUBri12  GPR:$src, imm0_4095_neg:$imm)>;
 let AddedComplexity = 1 in
-def : T2Pat<(addc       rGPR:$src, imm0_255_neg:$imm),
+def : T2Pat<(ARMaddc    rGPR:$src, imm0_255_neg:$imm),
             (t2SUBSri   rGPR:$src, imm0_255_neg:$imm)>;
-def : T2Pat<(addc       rGPR:$src, t2_so_imm_neg:$imm),
+def : T2Pat<(ARMaddc    rGPR:$src, t2_so_imm_neg:$imm),
             (t2SUBSri   rGPR:$src, t2_so_imm_neg:$imm)>;
 // The with-carry-in form matches bitwise not instead of the negation.
 // Effectively, the inverse interpretation of the carry flag already accounts
 // for part of the negation.
 let AddedComplexity = 1 in
-def : T2Pat<(adde_dead_carry       rGPR:$src, imm0_255_not:$imm),
+def : T2Pat<(ARMadde    rGPR:$src, imm0_255_not:$imm, CPSR),
             (t2SBCri    rGPR:$src, imm0_255_not:$imm)>;
-def : T2Pat<(adde_dead_carry       rGPR:$src, t2_so_imm_not:$imm),
+def : T2Pat<(ARMadde    rGPR:$src, t2_so_imm_not:$imm, CPSR),
             (t2SBCri    rGPR:$src, t2_so_imm_not:$imm)>;
-let AddedComplexity = 1 in
-def : T2Pat<(adde_live_carry       rGPR:$src, imm0_255_not:$imm),
-            (t2SBCSri   rGPR:$src, imm0_255_not:$imm)>;
-def : T2Pat<(adde_live_carry       rGPR:$src, t2_so_imm_not:$imm),
-            (t2SBCSri   rGPR:$src, t2_so_imm_not:$imm)>;
 
 // Select Bytes -- for disassembly only
 
@@ -1893,8 +2007,7 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
   let Inst{7-4}   = op7_4;
 }
 
-// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
-
+// Unsigned Sum of Absolute Differences [and Accumulate].
 def t2USAD8   : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
                                            (ins rGPR:$Rn, rGPR:$Rm),
                         NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
@@ -1906,8 +2019,7 @@ def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
                         "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 
-// Signed/Unsigned saturate -- for disassembly only
-
+// Signed/Unsigned saturate.
 class T2SatI<dag oops, dag iops, InstrItinClass itin,
            string opc, string asm, list<dag> pattern>
   : T2I<oops, iops, itin, opc, asm, pattern> {
@@ -1918,26 +2030,26 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
 
   let Inst{11-8}  = Rd;
   let Inst{19-16} = Rn;
-  let Inst{4-0}   = sat_imm{4-0};
-  let Inst{21}    = sh{6};
+  let Inst{4-0}   = sat_imm;
+  let Inst{21}    = sh{5};
   let Inst{14-12} = sh{4-2};
   let Inst{7-6}   = sh{1-0};
 }
 
 def t2SSAT: T2SatI<
-              (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
-              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
-              [/* For disassembly only; pattern left blank */]> {
+              (outs rGPR:$Rd),
+              (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
   let Inst{20} = 0;
   let Inst{15} = 0;
+  let Inst{5}  = 0;
 }
 
 def t2SSAT16: T2SatI<
-                (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn), NoItinerary,
-                "ssat16", "\t$Rd, $sat_imm, $Rn",
-                [/* For disassembly only; pattern left blank */]>,
+                (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
           Requires<[IsThumb2, HasThumb2DSP]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
@@ -1946,30 +2058,30 @@ def t2SSAT16: T2SatI<
   let Inst{21} = 1;        // sh = '1'
   let Inst{14-12} = 0b000; // imm3 = '000'
   let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
 }
 
 def t2USAT: T2SatI<
-                (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
-                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh",
-                [/* For disassembly only; pattern left blank */]> {
+               (outs rGPR:$Rd),
+               (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1110;
   let Inst{20} = 0;
   let Inst{15} = 0;
 }
 
-def t2USAT16: T2SatI<(outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn),
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
                      NoItinerary,
-                     "usat16", "\t$dst, $sat_imm, $Rn",
-                     [/* For disassembly only; pattern left blank */]>,
+                     "usat16", "\t$Rd, $sat_imm, $Rn", []>,
           Requires<[IsThumb2, HasThumb2DSP]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25-22} = 0b1110;
+  let Inst{31-22} = 0b1111001110;
   let Inst{20} = 0;
   let Inst{15} = 0;
   let Inst{21} = 1;        // sh = '1'
   let Inst{14-12} = 0b000; // imm3 = '000'
   let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
 }
 
 def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>;
@@ -1979,10 +2091,14 @@ def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
 //  Shift and rotate Instructions.
 //
 
-defm t2LSL  : T2I_sh_ir<0b00, "lsl", BinOpFrag<(shl  node:$LHS, node:$RHS)>>;
-defm t2LSR  : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl  node:$LHS, node:$RHS)>>;
-defm t2ASR  : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra  node:$LHS, node:$RHS)>>;
-defm t2ROR  : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm0_31,
+                        BinOpFrag<(shl  node:$LHS, node:$RHS)>, "t2LSL">;
+defm t2LSR  : T2I_sh_ir<0b01, "lsr", imm_sr,
+                        BinOpFrag<(srl  node:$LHS, node:$RHS)>, "t2LSR">;
+defm t2ASR  : T2I_sh_ir<0b10, "asr", imm_sr,
+                        BinOpFrag<(sra  node:$LHS, node:$RHS)>, "t2ASR">;
+defm t2ROR  : T2I_sh_ir<0b11, "ror", imm0_31,
+                        BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">;
 
 // (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
 def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
@@ -2090,7 +2206,7 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
 }
 
 def t2SBFX: T2TwoRegBitFI<
-                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
                  IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
@@ -2099,7 +2215,7 @@ def t2SBFX: T2TwoRegBitFI<
 }
 
 def t2UBFX: T2TwoRegBitFI<
-                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
                  IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
@@ -2125,26 +2241,6 @@ let Constraints = "$src = $Rd" in {
     let msb{4-0} = imm{9-5};
     let lsb{4-0} = imm{4-0};
   }
-
-  // GNU as only supports this form of bfi (w/ 4 arguments)
-  let isAsmParserOnly = 1 in
-  def t2BFI4p : T2TwoRegBitFI<(outs rGPR:$Rd),
-                  (ins rGPR:$src, rGPR:$Rn, lsb_pos_imm:$lsbit,
-                       width_imm:$width),
-                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width",
-                  []> {
-    let Inst{31-27} = 0b11110;
-    let Inst{26} = 0; // should be 0.
-    let Inst{25} = 1;
-    let Inst{24-20} = 0b10110;
-    let Inst{15} = 0;
-    let Inst{5} = 0; // should be 0.
-
-    bits<5> lsbit;
-    bits<5> width;
-    let msb{4-0} = width; // Custom encoder => lsb+width-1
-    let lsb{4-0} = lsbit;
-  }
 }
 
 defm t2ORN  : T2I_bin_irs<0b0011, "orn",
@@ -2152,13 +2248,53 @@ defm t2ORN  : T2I_bin_irs<0b0011, "orn",
                           BinOpFrag<(or  node:$LHS, (not node:$RHS))>,
                           "t2ORN", 0, "">;
 
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
+   // shifted imm
+   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+                opc, "\t$Rd, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
+     let isAsCheapAsAMove = Cheap;
+     let isReMaterializable = ReMat;
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{15} = 0;
+   }
+   // register
+   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+                opc, ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rd, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+   }
+}
+
 // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
 let AddedComplexity = 1 in
 defm t2MVN  : T2I_un_irs <0b0011, "mvn",
                           IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
                           UnOpFrag<(not node:$Src)>, 1, 1>;
 
-
 let AddedComplexity = 1 in
 def : T2Pat<(and     rGPR:$src, t2_so_imm_not:$imm),
             (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
@@ -2209,9 +2345,9 @@ def t2MLS: T2FourReg<
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
 def t2SMULL : T2MulLong<0b000, 0b0000,
-                  (outs rGPR:$Rd, rGPR:$Ra),
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
                   (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
-                   "smull", "\t$Rd, $Ra, $Rn, $Rm", []>;
+                   "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
 
 def t2UMULL : T2MulLong<0b010, 0b0000,
                   (outs rGPR:$RdLo, rGPR:$RdHi),
@@ -2468,7 +2604,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
 defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 
-// Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only
+// Halfword multiple accumulate long: SMLAL<x><y>
 def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
          (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
            [/* For disassembly only; pattern left blank */]>,
@@ -2487,8 +2623,6 @@ def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
           Requires<[IsThumb2, HasThumb2DSP]>;
 
 // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
-// These are for disassembly only.
-
 def t2SMUAD: T2ThreeReg_mac<
             0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
             IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>,
@@ -2513,7 +2647,7 @@ def t2SMUSDX:T2ThreeReg_mac<
           Requires<[IsThumb2, HasThumb2DSP]> {
   let Inst{15-12} = 0b1111;
 }
-def t2SMLAD   : T2ThreeReg_mac<
+def t2SMLAD   : T2FourReg_mac<
             0, 0b010, 0b0000, (outs rGPR:$Rd),
             (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
             "\t$Rd, $Rn, $Rm, $Ra", []>,
@@ -2532,20 +2666,20 @@ def t2SMLSDX  : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
             "\t$Rd, $Rn, $Rm, $Ra", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 def t2SMLALD  : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
-                        (ins rGPR:$Rm, rGPR:$Rn), IIC_iMAC64, "smlald",
-                        "\t$Ra, $Rd, $Rm, $Rn", []>,
+                        (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
-                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlaldx",
-                        "\t$Ra, $Rd, $Rm, $Rn", []>,
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 def t2SMLSLD  : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
-                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsld",
-                        "\t$Ra, $Rd, $Rm, $Rn", []>,
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
                         (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
-                        "\t$Ra, $Rd, $Rm, $Rn", []>,
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
           Requires<[IsThumb2, HasThumb2DSP]>;
 
 //===----------------------------------------------------------------------===//
@@ -2613,10 +2747,10 @@ def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
             (t2REVSH rGPR:$Rm)>;
 
 def t2PKHBT : T2ThreeReg<
-            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
                   IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
-                                      (and (shl rGPR:$Rm, lsl_amt:$sh),
+                                      (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
                                            0xFFFF0000)))]>,
                   Requires<[HasT2ExtractPack, IsThumb2]> {
   let Inst{31-27} = 0b11101;
@@ -2625,9 +2759,9 @@ def t2PKHBT : T2ThreeReg<
   let Inst{5} = 0; // BT form
   let Inst{4} = 0;
 
-  bits<8> sh;
-  let Inst{14-12} = sh{7-5};
-  let Inst{7-6}   = sh{4-3};
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
 }
 
 // Alternate cases for PKHBT where identities eliminate some nodes.
@@ -2635,16 +2769,16 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
             (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
-            (t2PKHBT rGPR:$src1, rGPR:$src2, (lsl_shift_imm imm16_31:$sh))>,
+            (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
 // will match the pattern below.
 def t2PKHTB : T2ThreeReg<
-                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
                   IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
-                                       (and (sra rGPR:$Rm, asr_amt:$sh),
+                                       (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
                                             0xFFFF)))]>,
                   Requires<[HasT2ExtractPack, IsThumb2]> {
   let Inst{31-27} = 0b11101;
@@ -2653,19 +2787,19 @@ def t2PKHTB : T2ThreeReg<
   let Inst{5} = 1; // TB form
   let Inst{4} = 0;
 
-  bits<8> sh;
-  let Inst{14-12} = sh{7-5};
-  let Inst{7-6}   = sh{4-3};
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
 }
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
-            (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm16_31:$sh))>,
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
                 (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
-            (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm1_15:$sh))>,
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 
 //===----------------------------------------------------------------------===//
@@ -2673,14 +2807,14 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
 //
 defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
                           IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
-                          BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+                          BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>, "t2CMP">;
 
-def : T2Pat<(ARMcmpZ  GPR:$lhs, t2_so_imm:$imm),
-            (t2CMPri  GPR:$lhs, t2_so_imm:$imm)>;
-def : T2Pat<(ARMcmpZ  GPR:$lhs, rGPR:$rhs),
-            (t2CMPrr  GPR:$lhs, rGPR:$rhs)>;
-def : T2Pat<(ARMcmpZ  GPR:$lhs, t2_so_reg:$rhs),
-            (t2CMPrs  GPR:$lhs, t2_so_reg:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_imm:$imm),
+            (t2CMPri  GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, rGPR:$rhs),
+            (t2CMPrr  GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_reg:$rhs),
+            (t2CMPrs  GPRnopc:$lhs, t2_so_reg:$rhs)>;
 
 //FIXME: Disable CMN, as CCodes are backwards from compare expectations
 //       Compare-to-zero still works out, just not the relationals
@@ -2688,20 +2822,23 @@ def : T2Pat<(ARMcmpZ  GPR:$lhs, t2_so_reg:$rhs),
 //                          BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
 defm t2CMNz : T2I_cmp_irs<0b1000, "cmn",
                           IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
-                          BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
+                          BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>,
+                          "t2CMNz">;
 
 //def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
 //            (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
 
-def : T2Pat<(ARMcmpZ  GPR:$src, t2_so_imm_neg:$imm),
-            (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$src, t2_so_imm_neg:$imm),
+            (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>;
 
 defm t2TST  : T2I_cmp_irs<0b0000, "tst",
                           IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
-                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>,
+                          "t2TST">;
 defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
                           IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
-                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>,
+                          "t2TEQ">;
 
 // Conditional moves
 // FIXME: should be able to write a pattern for ARMcmov, but can't use
@@ -2723,7 +2860,7 @@ def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
 // FIXME: Pseudo-ize these. For now, just mark codegen only.
 let isCodeGenOnly = 1 in {
 let isMoveImm = 1 in
-def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, i32imm_hilo16:$imm),
+def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
                       IIC_iCMOVi,
                       "movw", "\t$Rd, $imm", []>,
                       RegConstraint<"$false = $Rd"> {
@@ -2807,20 +2944,19 @@ def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
 }
 
 def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "dsb", "\t$opt",
-                  [/* For disassembly only; pattern left blank */]>,
+                  "dsb", "\t$opt", []>,
                   Requires<[IsThumb, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f4;
   let Inst{3-0} = opt;
 }
 
-// ISB has only full system option -- for disassembly only
-def t2ISB : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "isb", "",
-                  [/* For disassembly only; pattern left blank */]>,
-                  Requires<[IsThumb2, HasV7]> {
+def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "isb", "\t$opt",
+                  []>, Requires<[IsThumb2, HasDB]> {
+  bits<4> opt;
   let Inst{31-4} = 0xf3bf8f6;
-  let Inst{3-0} = 0b1111;
+  let Inst{3-0} = opt;
 }
 
 class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
@@ -2858,28 +2994,27 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
 }
 
 let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexb", "\t$Rt, $addr", "", []>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexh", "\t$Rt, $addr", "", []>;
-def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
                        AddrModeNone, 4, NoItinerary,
                        "ldrex", "\t$Rt, $addr", "", []> {
+  bits<4> Rt;
+  bits<12> addr;
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0000101;
-  let Inst{11-8} = 0b1111;
-  let Inst{7-0} = 0b00000000; // imm8 = 0
-
-  bits<4> Rt;
-  bits<4> addr;
-  let Inst{19-16} = addr;
+  let Inst{19-16} = addr{11-8};
   let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = addr{7-0};
 }
 let hasExtraDefRegAllocReq = 1 in
 def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
-                         (ins t2addrmode_reg:$addr),
+                         (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
                          [], {?, ?, ?, ?}> {
@@ -2890,33 +3025,33 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
 def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
-                         (ins rGPR:$Rt, t2addrmode_reg:$addr),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexb", "\t$Rd, $Rt, $addr", "", []>;
 def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
-                         (ins rGPR:$Rt, t2addrmode_reg:$addr),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexh", "\t$Rd, $Rt, $addr", "", []>;
-def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
+def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             t2addrmode_imm0_1020s4:$addr),
                   AddrModeNone, 4, NoItinerary,
                   "strex", "\t$Rd, $Rt, $addr", "",
                   []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0000100;
-  let Inst{7-0} = 0b00000000; // imm8 = 0
-
   bits<4> Rd;
-  bits<4> addr;
   bits<4> Rt;
-  let Inst{11-8}  = Rd;
-  let Inst{19-16} = addr;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000100;
+  let Inst{19-16} = addr{11-8};
   let Inst{15-12} = Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{7-0} = addr{7-0};
 }
 }
 
 let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
-                         (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
                          {?, ?, ?, ?}> {
@@ -2924,9 +3059,7 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
   let Inst{11-8} = Rt2;
 }
 
-// Clear-Exclusive is for disassembly only.
-def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex",
-                   [/* For disassembly only; pattern left blank */]>,
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
             Requires<[IsThumb2, HasV7]>  {
   let Inst{31-16} = 0xf3bf;
   let Inst{15-14} = 0b10;
@@ -2986,8 +3119,8 @@ def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
 
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 let isPredicable = 1 in
-def t2B   : T2XI<(outs), (ins uncondbrtarget:$target), IIC_Br,
-                 "b.w\t$target",
+def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+                 "b", ".w\t$target",
                  [(br bb:$target)]> {
   let Inst{31-27} = 0b11110;
   let Inst{15-14} = 0b10;
@@ -3009,15 +3142,13 @@ def t2BR_JT : t2PseudoInst<(outs),
 
 // FIXME: Add a non-pc based case that can be predicated.
 def t2TBB_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt, i32imm:$id),
-         0, IIC_Br, []>;
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
 
 def t2TBH_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt, i32imm:$id),
-         0, IIC_Br, []>;
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
 
-def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
-                    "tbb", "\t[$Rn, $Rm]", []> {
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+                    "tbb", "\t$addr", []> {
   bits<4> Rn;
   bits<4> Rm;
   let Inst{31-20} = 0b111010001101;
@@ -3025,10 +3156,12 @@ def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
   let Inst{15-5} = 0b11110000000;
   let Inst{4} = 0; // B form
   let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
 }
 
-def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
-                   "tbh", "\t[$Rn, $Rm, lsl #1]", []> {
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+                   "tbh", "\t$addr", []> {
   bits<4> Rn;
   bits<4> Rm;
   let Inst{31-20} = 0b111010001101;
@@ -3036,13 +3169,15 @@ def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
   let Inst{15-5} = 0b11110000000;
   let Inst{4} = 1; // H form
   let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
 }
 } // isNotDuplicable, isIndirectBranch
 
 } // isBranch, isTerminator, isBarrier
 
 // FIXME: should be able to write a pattern for ARMBrcond, but can't use
-// a two-value operand where a dag node expects two operands. :(
+// a two-value operand where a dag node expects ", "two operands. :(
 let isBranch = 1, isTerminator = 1 in
 def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
                 "b", ".w\t$target",
@@ -3060,6 +3195,8 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
   let Inst{13} = target{18};
   let Inst{21-16} = target{17-12};
   let Inst{10-0} = target{11-1};
+
+  let DecoderMethod = "DecodeThumb2BCCInstruction";
 }
 
 // Tail calls. The Darwin version of thumb tail calls uses a t2 branch, so
@@ -3068,9 +3205,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Darwin version.
   let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in
-  def tTAILJMPd: tPseudoExpand<(outs), (ins uncondbrtarget:$dst, variable_ops),
+  def tTAILJMPd: tPseudoExpand<(outs),
+                   (ins uncondbrtarget:$dst, pred:$p, variable_ops),
                    4, IIC_Br, [],
-                   (t2B uncondbrtarget:$dst)>,
+                   (t2B uncondbrtarget:$dst, pred:$p)>,
                  Requires<[IsThumb2, IsDarwin]>;
 }
 
@@ -3087,30 +3225,55 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
   bits<4> mask;
   let Inst{7-4} = cc;
   let Inst{3-0} = mask;
+
+  let DecoderMethod = "DecodeIT";
 }
 
 // Branch and Exchange Jazelle -- for disassembly only
 // Rm = Inst{19-16}
-def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func",
-              [/* For disassembly only; pattern left blank */]> {
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> {
+  bits<4> func;
   let Inst{31-27} = 0b11110;
   let Inst{26} = 0;
   let Inst{25-20} = 0b111100;
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
-
-  bits<4> func;
   let Inst{19-16} = func;
+  let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbz\t$Rn, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+
+  def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbnz\t$Rn, $target", []>,
+              T1Misc<{1,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
 }
 
-// Change Processor State is a system instruction -- for disassembly and
-// parsing only.
+
+// Change Processor State is a system instruction.
 // FIXME: Since the asm parser has currently no clean way to handle optional
 // operands, create 3 versions of the same instruction. Once there's a clean
 // framework to represent optional operands, change this behavior.
 class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
-            !strconcat("cps", asm_op),
-            [/* For disassembly only; pattern left blank */]> {
+            !strconcat("cps", asm_op), []> {
   bits<2> imod;
   bits<3> iflags;
   bits<5> mode;
@@ -3126,6 +3289,7 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
   let Inst{8}     = M;
   let Inst{7-5}   = iflags;
   let Inst{4-0}   = mode;
+  let DecoderMethod = "DecodeT2CPSInstruction";
 }
 
 let M = 1 in
@@ -3135,14 +3299,12 @@ let mode = 0, M = 0 in
   def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
                       "$imod.w\t$iflags">;
 let imod = 0, iflags = 0, M = 1 in
-  def t2CPS1p : t2CPS<(ins i32imm:$mode), "\t$mode">;
+  def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
 
 // A6.3.4 Branches and miscellaneous control
 // Table A6-14 Change Processor State, and hint instructions
-// Helper class for disassembly only.
 class T2I_hint<bits<8> op7_0, string opc, string asm>
-  : T2I<(outs), (ins), NoItinerary, opc, asm,
-        [/* For disassembly only; pattern left blank */]> {
+  : T2I<(outs), (ins), NoItinerary, opc, asm, []> {
   let Inst{31-20} = 0xf3a;
   let Inst{19-16} = 0b1111;
   let Inst{15-14} = 0b10;
@@ -3158,20 +3320,17 @@ def t2WFI   : T2I_hint<0b00000011, "wfi",   ".w">;
 def t2SEV   : T2I_hint<0b00000100, "sev",   ".w">;
 
 def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
-  let Inst{31-20} = 0xf3a;
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
-  let Inst{10-8} = 0b000;
-  let Inst{7-4} = 0b1111;
-
   bits<4> opt;
+  let Inst{31-20} = 0b111100111010;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-8} = 0b10000000;
+  let Inst{7-4} = 0b1111;
   let Inst{3-0} = opt;
 }
 
-// Secure Monitor Call is a system instruction -- for disassembly only
+// Secure Monitor Call is a system instruction.
 // Option = Inst{19-16}
-def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
-                [/* For disassembly only; pattern left blank */]> {
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []> {
   let Inst{31-27} = 0b11110;
   let Inst{26-20} = 0b1111111;
   let Inst{15-12} = 0b1000;
@@ -3180,32 +3339,30 @@ def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
   let Inst{19-16} = opt;
 }
 
-class T2SRS<bits<12> op31_20,
-           dag oops, dag iops, InstrItinClass itin,
-          string opc, string asm, list<dag> pattern>
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
   : T2I<oops, iops, itin, opc, asm, pattern> {
-  let Inst{31-20} = op31_20{11-0};
-
   bits<5> mode;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24-23} = Op;
+  let Inst{22} = 0;
+  let Inst{21} = W;
+  let Inst{20-16} = 0b01101;
+  let Inst{15-5} = 0b11000000000;
   let Inst{4-0} = mode{4-0};
 }
 
-// Store Return State is a system instruction -- for disassembly only
-def t2SRSDBW : T2SRS<0b111010000010,
-                   (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode",
-                   [/* For disassembly only; pattern left blank */]>;
-def t2SRSDB  : T2SRS<0b111010000000,
-                   (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode",
-                   [/* For disassembly only; pattern left blank */]>;
-def t2SRSIAW : T2SRS<0b111010011010,
-                   (outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode",
-                   [/* For disassembly only; pattern left blank */]>;
-def t2SRSIA  : T2SRS<0b111010011000,
-                   (outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode",
-                   [/* For disassembly only; pattern left blank */]>;
-
-// Return From Exception is a system instruction -- for disassembly only
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB  : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsia","\tsp!, $mode", []>;
+def t2SRSIA  : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsia","\tsp, $mode", []>;
 
+// Return From Exception is a system instruction.
 class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
           string opc, string asm, list<dag> pattern>
   : T2I<oops, iops, itin, opc, asm, pattern> {
@@ -3277,53 +3434,186 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
                                            imm:$cp))]>,
                Requires<[IsThumb2]>;
 
+// Pseudo isntruction that combines movs + predicated rsbmi 
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+                       NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
+  : T2I<oops, iops, NoItinerary, opc, asm, []> {
+  let Inst{31-28} = op31_28;
+  let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+  def _OFFSET : T2CI<op31_28,
+                     (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                     asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                  asm, "\t$cop, $CRd, $addr!"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                               postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset"> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : T2CI<op31_28, (outs),
+                     (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+
+
 //===----------------------------------------------------------------------===//
 // Move between special register and ARM core register -- for disassembly only
 //
+// Move to ARM core register from Special Register
 
-class T2SpecialReg<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
-          dag oops, dag iops, InstrItinClass itin,
-          string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
-  let Inst{31-20} = op31_20{11-0};
-  let Inst{15-14} = op15_14{1-0};
-  let Inst{12} = op12{0};
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>,
+               Requires<[IsThumb2,IsARClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
 }
 
-class T2MRS<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
-          dag oops, dag iops, InstrItinClass itin,
-          string opc, string asm, list<dag> pattern>
-  : T2SpecialReg<op31_20, op15_14, op12, oops, iops, itin, opc, asm, pattern> {
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>,
+                 Requires<[IsThumb2,IsARClass]> {
   bits<4> Rd;
+  let Inst{31-12} = 0b11110011111111111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
+}
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
+                  "mrs", "\t$Rd, $mask", []>,
+              Requires<[IsThumb2,IsMClass]> {
+  bits<4> Rd;
+  bits<8> mask;
+  let Inst{31-12} = 0b11110011111011111000;
   let Inst{11-8} = Rd;
   let Inst{19-16} = 0b1111;
+  let Inst{7-0} = mask;
 }
 
-def t2MRS : T2MRS<0b111100111110, 0b10, 0,
-                (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
-                [/* For disassembly only; pattern left blank */]>;
-def t2MRSsys : T2MRS<0b111100111111, 0b10, 0,
-                   (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
-                   [/* For disassembly only; pattern left blank */]>;
 
 // Move from ARM core register to Special Register
 //
+// A/R class MSR.
+//
 // No need to have both system and application versions, the encodings are the
 // same and the assembly parser has no way to distinguish between them. The mask
 // operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
 // the mask with the fields to be accessed in the special register.
-def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */,
-                         0 /* op12 */, (outs), (ins msr_mask:$mask, rGPR:$Rn),
-                         NoItinerary, "msr", "\t$mask, $Rn",
-                         [/* For disassembly only; pattern left blank */]> {
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+                   NoItinerary, "msr", "\t$mask, $Rn", []>,
+               Requires<[IsThumb2,IsARClass]> {
   bits<5> mask;
   bits<4> Rn;
-  let Inst{19-16} = Rn;
+  let Inst{31-21} = 0b11110011100;
   let Inst{20}    = mask{4}; // R Bit
-  let Inst{13}    = 0b0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
   let Inst{11-8}  = mask{3-0};
+  let Inst{7-0}   = 0;
 }
 
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+                  NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+              Requires<[IsThumb2,IsMClass]> {
+  bits<8> SYSm;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = 0b0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{7-0}  = SYSm;
+}
+
+
 //===----------------------------------------------------------------------===//
 // Move between coprocessor and ARM core register
 //
@@ -3389,13 +3679,12 @@ def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
 
 /* from coprocessor to ARM core register */
 def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
-           (outs GPR:$Rt),
-           (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
-           []>;
+             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
 
 def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
-             (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
-                                  c_imm:$CRm, i32imm:$opc2), []>;
+             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
 
 def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
               (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3465,3 +3754,269 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{19-16} = CRn;
   let Inst{23-20} = opc1;
 }
+
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+            (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+            (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8),  (t2SXTB rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+            (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+            (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8   t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_imm12:$addr),
+            (t2LDRi12   t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_negimm8:$addr),
+            (t2LDRi8    t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_so_reg:$addr),
+            (t2LDRs     t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRBi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRBi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRBs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRHi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRHi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRHs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRi12   GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRi8    GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+                  (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+                  (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+              (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+              (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+                  (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+                  (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+                  (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Alias for MVN without the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+           (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+           (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+           (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+           (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+           (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+           (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+                    cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                  (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                  (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm",
+                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm",
+                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+                  (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+                  (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+                  (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+                  (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+                  (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+                  (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index f1f3cb9c2ecd7..e746cf20d032a 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -31,18 +31,34 @@ def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
 // Operand Definitions.
 //
 
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "parseFPImm";
+}
+
 def vfp_f32imm : Operand<f32>,
                  PatLeaf<(f32 fpimm), [{
-      return ARM::getVFPf32Imm(N->getValueAPF()) != -1;
-    }]> {
-  let PrintMethod = "printVFPf32ImmOperand";
+      return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
 }
 
 def vfp_f64imm : Operand<f64>,
                  PatLeaf<(f64 fpimm), [{
-      return ARM::getVFPf64Imm(N->getValueAPF()) != -1;
-    }]> {
-  let PrintMethod = "printVFPf64ImmOperand";
+      return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
 }
 
 
@@ -385,26 +401,26 @@ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
 // Between half-precision and single-precision.  For disassembly only.
 
 // FIXME: Verify encoding after integrated assembler is working.
-def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
-                 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a",
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
 def : ARMPat<(f32_to_f16 SPR:$a),
              (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
 
-def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
-                 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a",
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
 def : ARMPat<(f16_to_f32 GPR:$a),
              (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
-def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
-                 /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a",
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
-def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
-                 /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a",
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
 def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
@@ -511,14 +527,25 @@ def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
 }
 
 def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
-                      (outs GPR:$wb, GPR:$dst2), (ins SPR:$src1, SPR:$src2),
-                 IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2",
+                      (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+                 IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
                  [/* For disassembly only; pattern left blank */]> {
+  bits<5> src1;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = src1{3-0};
+  let Inst{5}     = src1{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
   let Inst{7-6} = 0b00;
 
   // Some single precision VFP instructions may be executed on both NEON and VFP
   // pipelines.
   let D = VFPNeonDomain;
+  let DecoderMethod = "DecodeVMOVRRS";
 }
 } // neverHasSideEffects
 
@@ -552,11 +579,24 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
                      (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
                 IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
                 [/* For disassembly only; pattern left blank */]> {
+  // Instruction operands.
+  bits<5> dst1;
+  bits<4> src1;
+  bits<4> src2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = dst1{3-0};
+  let Inst{5}     = dst1{4};
+  let Inst{15-12} = src1;
+  let Inst{19-16} = src2;
+
   let Inst{7-6} = 0b00;
 
   // Some single precision VFP instructions may be executed on both NEON and VFP
   // pipelines.
   let D = VFPNeonDomain;
+
+  let DecoderMethod = "DecodeVMOVSRR";
 }
 
 // FMRDH: SPR -> GPR
@@ -1084,45 +1124,42 @@ def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
                     VFPMiscFrm, IIC_fpUNA64,
                     "vmov", ".f64\t$Dd, $imm",
                     [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
-  // Instruction operands.
-  bits<5>  Dd;
-  bits<32> imm;
-
-  // Encode instruction operands.
-  let Inst{15-12} = Dd{3-0};
-  let Inst{22}    = Dd{4};
-  let Inst{19}    = imm{31};
-  let Inst{18-16} = imm{22-20};
-  let Inst{3-0}   = imm{19-16};
+  bits<5> Dd;
+  bits<8> imm;
 
-  // Encode remaining instruction bits.
   let Inst{27-23} = 0b11101;
+  let Inst{22}    = Dd{4};
   let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Dd{3-0};
   let Inst{11-9}  = 0b101;
   let Inst{8}     = 1;          // Double precision.
   let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
 }
 
 def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
                      VFPMiscFrm, IIC_fpUNA32,
                      "vmov", ".f32\t$Sd, $imm",
                      [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
-  // Instruction operands.
-  bits<5>  Sd;
-  bits<32> imm;
-
-  // Encode instruction operands.
-  let Inst{15-12} = Sd{4-1};
-  let Inst{22}    = Sd{0};
-  let Inst{19}    = imm{31};    // The immediate is handled as a double.
-  let Inst{18-16} = imm{22-20};
-  let Inst{3-0}   = imm{19-16};
+  bits<5> Sd;
+  bits<8> imm;
 
-  // Encode remaining instruction bits.
   let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
   let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
   let Inst{11-9}  = 0b101;
   let Inst{8}     = 0;          // Single precision.
   let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
 }
 }
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c6efea1d7806e..faa8ba76845e4 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -14,10 +14,10 @@
 
 #define DEBUG_TYPE "arm-ldst-opt"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -763,9 +764,9 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
   switch (Opc) {
   case ARM::LDRi12:
-    return ARM::LDR_PRE;
+    return ARM::LDR_PRE_IMM;
   case ARM::STRi12:
-    return ARM::STR_PRE;
+    return ARM::STR_PRE_IMM;
   case ARM::VLDRS:
     return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
   case ARM::VLDRD:
@@ -789,9 +790,9 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                               ARM_AM::AddrOpc Mode) {
   switch (Opc) {
   case ARM::LDRi12:
-    return ARM::LDR_POST;
+    return ARM::LDR_POST_IMM;
   case ARM::STRi12:
-    return ARM::STR_POST;
+    return ARM::STR_POST_IMM;
   case ARM::VLDRS:
     return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
   case ARM::VLDRD:
@@ -892,12 +893,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   if (!DoMerge)
     return false;
 
-  unsigned Offset = 0;
-  if (isAM2)
-    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
-  else if (!isAM5)
-    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
-
   if (isAM5) {
     // VLDM[SD}_UPD, VSTM[SD]_UPD
     // (There are no base-updating versions of VLDR/VSTR instructions, but the
@@ -911,28 +906,44 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
       .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                             getKillRegState(MO.isKill())));
   } else if (isLd) {
-    if (isAM2)
-      // LDR_PRE, LDR_POST,
-      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
-        .addReg(Base, RegState::Define)
-        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
-    else
+    if (isAM2) {
+      // LDR_PRE, LDR_POST
+      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+      } else {
+        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+      }
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
       // t2LDR_PRE, t2LDR_POST
       BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
         .addReg(Base, RegState::Define)
         .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
   } else {
     MachineOperand &MO = MI->getOperand(0);
-    if (isAM2)
+    // FIXME: post-indexed stores use am2offset_imm, which still encodes
+    // the vestigal zero-reg offset register. When that's fixed, this clause
+    // can be removed entirely.
+    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
       // STR_PRE, STR_POST
       BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
         .addReg(MO.getReg(), getKillRegState(MO.isKill()))
         .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
-    else
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
       // t2STR_PRE, t2STR_POST
       BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
         .addReg(MO.getReg(), getKillRegState(MO.isKill()))
         .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
   }
   MBB.erase(MBBI);
 
diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp
deleted file mode 100644
index 39be3f0e39f8d..0000000000000
--- a/lib/Target/ARM/ARMMCCodeEmitter.cpp
+++ /dev/null
@@ -1,1314 +0,0 @@
-//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARMMCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMFixupKinds.h"
-#include "ARMInstrInfo.h"
-#include "ARMMCExpr.h"
-#include "ARMSubtarget.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
-STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
-
-namespace {
-class ARMMCCodeEmitter : public MCCodeEmitter {
-  ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
-  const MCInstrInfo &MCII;
-  const MCSubtargetInfo &STI;
-
-public:
-  ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                   MCContext &ctx)
-    : MCII(mcii), STI(sti) {
-  }
-
-  ~ARMMCCodeEmitter() {}
-
-  bool isThumb() const {
-    // FIXME: Can tablegen auto-generate this?
-    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
-  }
-  bool isThumb2() const {
-    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
-  }
-  bool isTargetDarwin() const {
-    Triple TT(STI.getTargetTriple());
-    Triple::OSType OS = TT.getOS();
-    return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
-  }
-
-  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
-
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  unsigned getBinaryCodeForInstr(const MCInst &MI,
-                                 SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
-  /// the specified operand. This is used for operands with :lower16: and
-  /// :upper16: prefixes.
-  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
-                              unsigned &Reg, unsigned &Imm,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
-  /// BL branch target.
-  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
-  /// BLX branch target.
-  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
-  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
-  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
-  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
-  /// branch target.
-  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                  SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
-  /// immediate Thumb2 direct branch target.
-  uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                  SmallVectorImpl<MCFixup> &Fixups) const;
-  
-  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
-  /// branch target.
-  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
-  /// ADR label target.
-  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-
-
-  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
-  /// operand.
-  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
-  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
-                                         SmallVectorImpl<MCFixup> &Fixups)const;
-
-  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
-  /// operand.
-  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups) const;
-
-
-  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
-  /// operand as needed by load/store instructions.
-  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
-  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
-    switch (Mode) {
-    default: assert(0 && "Unknown addressing sub-mode!");
-    case ARM_AM::da: return 0;
-    case ARM_AM::ia: return 1;
-    case ARM_AM::db: return 2;
-    case ARM_AM::ib: return 3;
-    }
-  }
-  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
-  ///
-  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
-    switch (ShOpc) {
-    default: llvm_unreachable("Unknown shift opc!");
-    case ARM_AM::no_shift:
-    case ARM_AM::lsl: return 0;
-    case ARM_AM::lsr: return 1;
-    case ARM_AM::asr: return 2;
-    case ARM_AM::ror:
-    case ARM_AM::rrx: return 3;
-    }
-    return 0;
-  }
-
-  /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
-  uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
-  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
-                                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
-  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
-                                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
-  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
-  /// operand.
-  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
-                                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
-  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
-  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
-  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getCCOutOpValue - Return encoding of the 's' bit.
-  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
-                           SmallVectorImpl<MCFixup> &Fixups) const {
-    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
-    // '1' respectively.
-    return MI.getOperand(Op).getReg() == ARM::CPSR;
-  }
-
-  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
-  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
-                           SmallVectorImpl<MCFixup> &Fixups) const {
-    unsigned SoImm = MI.getOperand(Op).getImm();
-    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
-    assert(SoImmVal != -1 && "Not a valid so_imm value!");
-
-    // Encode rotate_imm.
-    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
-      << ARMII::SoRotImmShift;
-
-    // Encode immed_8.
-    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
-    return Binary;
-  }
-
-  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
-  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
-                           SmallVectorImpl<MCFixup> &Fixups) const {
-    unsigned SoImm = MI.getOperand(Op).getImm();
-    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
-    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
-    return Encoded;
-  }
-
-  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
-    SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
-    SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
-    SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
-    SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getSORegOpValue - Return an encoded so_reg shifted register value.
-  unsigned getSORegOpValue(const MCInst &MI, unsigned Op,
-                           SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned getRotImmOpValue(const MCInst &MI, unsigned Op,
-                            SmallVectorImpl<MCFixup> &Fixups) const {
-    switch (MI.getOperand(Op).getImm()) {
-    default: assert (0 && "Not a valid rot_imm value!");
-    case 0:  return 0;
-    case 8:  return 1;
-    case 16: return 2;
-    case 24: return 3;
-    }
-  }
-
-  unsigned getImmMinusOneOpValue(const MCInst &MI, unsigned Op,
-                                 SmallVectorImpl<MCFixup> &Fixups) const {
-    return MI.getOperand(Op).getImm() - 1;
-  }
-
-  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
-                                   SmallVectorImpl<MCFixup> &Fixups) const {
-    return 64 - MI.getOperand(Op).getImm();
-  }
-
-  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
-                                      SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned getMsbOpValue(const MCInst &MI, unsigned Op,
-                         SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned getSsatBitPosValue(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
-                                  SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
-                                      SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
-                                        SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
-                                        SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
-                                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups) const;
-
-  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
-                                      unsigned EncodedValue) const;
-  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
-                                          unsigned EncodedValue) const;
-  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
-                                    unsigned EncodedValue) const;
-
-  unsigned VFPThumb2PostEncoder(const MCInst &MI,
-                                unsigned EncodedValue) const;
-
-  void EmitByte(unsigned char C, raw_ostream &OS) const {
-    OS << (char)C;
-  }
-
-  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
-    // Output the constant in little endian byte order.
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, OS);
-      Val >>= 8;
-    }
-  }
-
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups) const;
-};
-
-} // end anonymous namespace
-
-MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII,
-                                            const MCSubtargetInfo &STI,
-                                            MCContext &Ctx) {
-  return new ARMMCCodeEmitter(MCII, STI, Ctx);
-}
-
-/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
-/// instructions, and rewrite them to their Thumb2 form if we are currently in
-/// Thumb2 mode.
-unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
-                                                 unsigned EncodedValue) const {
-  if (isThumb2()) {
-    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
-    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
-    // set to 1111.
-    unsigned Bit24 = EncodedValue & 0x01000000;
-    unsigned Bit28 = Bit24 << 4;
-    EncodedValue &= 0xEFFFFFFF;
-    EncodedValue |= Bit28;
-    EncodedValue |= 0x0F000000;
-  }
-
-  return EncodedValue;
-}
-
-/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
-/// instructions, and rewrite them to their Thumb2 form if we are currently in
-/// Thumb2 mode.
-unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
-                                                 unsigned EncodedValue) const {
-  if (isThumb2()) {
-    EncodedValue &= 0xF0FFFFFF;
-    EncodedValue |= 0x09000000;
-  }
-
-  return EncodedValue;
-}
-
-/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
-/// instructions, and rewrite them to their Thumb2 form if we are currently in
-/// Thumb2 mode.
-unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
-                                                 unsigned EncodedValue) const {
-  if (isThumb2()) {
-    EncodedValue &= 0x00FFFFFF;
-    EncodedValue |= 0xEE000000;
-  }
-
-  return EncodedValue;
-}
-
-/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
-/// them to their Thumb2 form if we are currently in Thumb2 mode.
-unsigned ARMMCCodeEmitter::
-VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
-  if (isThumb2()) {
-    EncodedValue &= 0x0FFFFFFF;
-    EncodedValue |= 0xE0000000;
-  }
-  return EncodedValue;
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned ARMMCCodeEmitter::
-getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
-    unsigned RegNo = getARMRegisterNumbering(Reg);
-
-    // Q registers are encoded as 2x their register number.
-    switch (Reg) {
-    default:
-      return RegNo;
-    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
-    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
-    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
-    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
-      return 2 * RegNo;
-    }
-  } else if (MO.isImm()) {
-    return static_cast<unsigned>(MO.getImm());
-  } else if (MO.isFPImm()) {
-    return static_cast<unsigned>(APFloat(MO.getFPImm())
-                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
-  }
-
-  llvm_unreachable("Unable to encode MCOperand!");
-  return 0;
-}
-
-/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
-bool ARMMCCodeEmitter::
-EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
-                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO  = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-
-  Reg = getARMRegisterNumbering(MO.getReg());
-
-  int32_t SImm = MO1.getImm();
-  bool isAdd = true;
-
-  // Special value for #-0
-  if (SImm == INT32_MIN)
-    SImm = 0;
-
-  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
-  if (SImm < 0) {
-    SImm = -SImm;
-    isAdd = false;
-  }
-
-  Imm = SImm;
-  return isAdd;
-}
-
-/// getBranchTargetOpValue - Helper function to get the branch target operand,
-/// which is either an immediate or requires a fixup.
-static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                       unsigned FixupKind,
-                                       SmallVectorImpl<MCFixup> &Fixups) {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  // If the destination is an immediate, we have nothing to do.
-  if (MO.isImm()) return MO.getImm();
-  assert(MO.isExpr() && "Unexpected branch target type!");
-  const MCExpr *Expr = MO.getExpr();
-  MCFixupKind Kind = MCFixupKind(FixupKind);
-  Fixups.push_back(MCFixup::Create(0, Expr, Kind));
-
-  // All of the information is in the fixup.
-  return 0;
-}
-
-/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
-uint32_t ARMMCCodeEmitter::
-getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                        SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, Fixups);
-}
-
-/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
-/// BLX branch target.
-uint32_t ARMMCCodeEmitter::
-getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, Fixups);
-}
-
-/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
-uint32_t ARMMCCodeEmitter::
-getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                        SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, Fixups);
-}
-
-/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
-uint32_t ARMMCCodeEmitter::
-getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, Fixups);
-}
-
-/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
-uint32_t ARMMCCodeEmitter::
-getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                        SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
-}
-
-/// Return true if this branch has a non-always predication
-static bool HasConditionalBranch(const MCInst &MI) {
-  int NumOp = MI.getNumOperands();
-  if (NumOp >= 2) {
-    for (int i = 0; i < NumOp-1; ++i) {
-      const MCOperand &MCOp1 = MI.getOperand(i);
-      const MCOperand &MCOp2 = MI.getOperand(i + 1);
-      if (MCOp1.isImm() && MCOp2.isReg() && 
-          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
-        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL) 
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
-/// target.
-uint32_t ARMMCCodeEmitter::
-getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                       SmallVectorImpl<MCFixup> &Fixups) const {
-  // FIXME: This really, really shouldn't use TargetMachine. We don't want
-  // coupling between MC and TM anywhere we can help it.
-  if (isThumb2())
-    return
-      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups);
-  return getARMBranchTargetOpValue(MI, OpIdx, Fixups);
-}
-
-/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
-/// target.
-uint32_t ARMMCCodeEmitter::
-getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                          SmallVectorImpl<MCFixup> &Fixups) const {
-  if (HasConditionalBranch(MI)) 
-    return ::getBranchTargetOpValue(MI, OpIdx,
-                                    ARM::fixup_arm_condbranch, Fixups);
-  return ::getBranchTargetOpValue(MI, OpIdx, 
-                                  ARM::fixup_arm_uncondbranch, Fixups);
-}
-
-
-
-
-/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
-/// immediate branch target.
-uint32_t ARMMCCodeEmitter::
-getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                       SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Val =
-    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
-  bool I  = (Val & 0x800000);
-  bool J1 = (Val & 0x400000);
-  bool J2 = (Val & 0x200000);
-  if (I ^ J1)
-    Val &= ~0x400000;
-  else
-    Val |= 0x400000;
-
-  if (I ^ J2)
-    Val &= ~0x200000;
-  else
-    Val |= 0x200000;
-
-  return Val;
-}
-
-/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
-/// target.
-uint32_t ARMMCCodeEmitter::
-getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
-                                  Fixups);
-}
-
-/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
-/// target.
-uint32_t ARMMCCodeEmitter::
-getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
-                                  Fixups);
-}
-
-/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
-/// target.
-uint32_t ARMMCCodeEmitter::
-getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
-                                  Fixups);
-}
-
-/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
-/// operand.
-uint32_t ARMMCCodeEmitter::
-getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
-                              SmallVectorImpl<MCFixup> &) const {
-  // [Rn, Rm]
-  //   {5-3} = Rm
-  //   {2-0} = Rn
-  const MCOperand &MO1 = MI.getOperand(OpIdx);
-  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = getARMRegisterNumbering(MO1.getReg());
-  unsigned Rm = getARMRegisterNumbering(MO2.getReg());
-  return (Rm << 3) | Rn;
-}
-
-/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
-uint32_t ARMMCCodeEmitter::
-getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
-                        SmallVectorImpl<MCFixup> &Fixups) const {
-  // {17-13} = reg
-  // {12}    = (U)nsigned (add == '1', sub == '0')
-  // {11-0}  = imm12
-  unsigned Reg, Imm12;
-  bool isAdd = true;
-  // If The first operand isn't a register, we have a label reference.
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  if (!MO.isReg()) {
-    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
-    Imm12 = 0;
-    isAdd = false ; // 'U' bit is set as part of the fixup.
-
-    assert(MO.isExpr() && "Unexpected machine operand type!");
-    const MCExpr *Expr = MO.getExpr();
-
-    MCFixupKind Kind;
-    if (isThumb2())
-      Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
-    else
-      Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
-    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
-
-    ++MCNumCPRelocations;
-  } else
-    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
-
-  uint32_t Binary = Imm12 & 0xfff;
-  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
-  if (isAdd)
-    Binary |= (1 << 12);
-  Binary |= (Reg << 13);
-  return Binary;
-}
-
-/// getT2AddrModeImm8s4OpValue - Return encoding info for
-/// 'reg +/- imm8<<2' operand.
-uint32_t ARMMCCodeEmitter::
-getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
-                        SmallVectorImpl<MCFixup> &Fixups) const {
-  // {12-9} = reg
-  // {8}    = (U)nsigned (add == '1', sub == '0')
-  // {7-0}  = imm8
-  unsigned Reg, Imm8;
-  bool isAdd = true;
-  // If The first operand isn't a register, we have a label reference.
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  if (!MO.isReg()) {
-    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
-    Imm8 = 0;
-    isAdd = false ; // 'U' bit is set as part of the fixup.
-
-    assert(MO.isExpr() && "Unexpected machine operand type!");
-    const MCExpr *Expr = MO.getExpr();
-    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
-    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
-
-    ++MCNumCPRelocations;
-  } else
-    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
-
-  uint32_t Binary = (Imm8 >> 2) & 0xff;
-  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
-  if (isAdd)
-    Binary |= (1 << 8);
-  Binary |= (Reg << 9);
-  return Binary;
-}
-
-// FIXME: This routine assumes that a binary
-// expression will always result in a PCRel expression
-// In reality, its only true if one or more subexpressions
-// is itself a PCRel (i.e. "." in asm or some other pcrel construct)
-// but this is good enough for now.
-static bool EvaluateAsPCRel(const MCExpr *Expr) {
-  switch (Expr->getKind()) {
-  default: assert(0 && "Unexpected expression type");
-  case MCExpr::SymbolRef: return false;
-  case MCExpr::Binary: return true;
-  }
-}
-
-uint32_t
-ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                      SmallVectorImpl<MCFixup> &Fixups) const {
-  // {20-16} = imm{15-12}
-  // {11-0}  = imm{11-0}
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  if (MO.isImm())
-    // Hi / lo 16 bits already extracted during earlier passes.
-    return static_cast<unsigned>(MO.getImm());
-
-  // Handle :upper16: and :lower16: assembly prefixes.
-  const MCExpr *E = MO.getExpr();
-  if (E->getKind() == MCExpr::Target) {
-    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
-    E = ARM16Expr->getSubExpr();
-
-    MCFixupKind Kind;
-    switch (ARM16Expr->getKind()) {
-    default: assert(0 && "Unsupported ARMFixup");
-    case ARMMCExpr::VK_ARM_HI16:
-      if (!isTargetDarwin() && EvaluateAsPCRel(E))
-        Kind = MCFixupKind(isThumb2()
-                           ? ARM::fixup_t2_movt_hi16_pcrel
-                           : ARM::fixup_arm_movt_hi16_pcrel);
-      else
-        Kind = MCFixupKind(isThumb2()
-                           ? ARM::fixup_t2_movt_hi16
-                           : ARM::fixup_arm_movt_hi16);
-      break;
-    case ARMMCExpr::VK_ARM_LO16:
-      if (!isTargetDarwin() && EvaluateAsPCRel(E))
-        Kind = MCFixupKind(isThumb2()
-                           ? ARM::fixup_t2_movw_lo16_pcrel
-                           : ARM::fixup_arm_movw_lo16_pcrel);
-      else
-        Kind = MCFixupKind(isThumb2()
-                           ? ARM::fixup_t2_movw_lo16
-                           : ARM::fixup_arm_movw_lo16);
-      break;
-    }
-    Fixups.push_back(MCFixup::Create(0, E, Kind));
-    return 0;
-  };
-
-  llvm_unreachable("Unsupported MCExpr type in MCOperand!");
-  return 0;
-}
-
-uint32_t ARMMCCodeEmitter::
-getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
-  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
-  unsigned Rn = getARMRegisterNumbering(MO.getReg());
-  unsigned Rm = getARMRegisterNumbering(MO1.getReg());
-  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
-  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
-  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
-  unsigned SBits = getShiftOp(ShOp);
-
-  // {16-13} = Rn
-  // {12}    = isAdd
-  // {11-0}  = shifter
-  //  {3-0}  = Rm
-  //  {4}    = 0
-  //  {6-5}  = type
-  //  {11-7} = imm
-  uint32_t Binary = Rm;
-  Binary |= Rn << 13;
-  Binary |= SBits << 5;
-  Binary |= ShImm << 7;
-  if (isAdd)
-    Binary |= 1 << 12;
-  return Binary;
-}
-
-uint32_t ARMMCCodeEmitter::
-getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  // {17-14}  Rn
-  // {13}     1 == imm12, 0 == Rm
-  // {12}     isAdd
-  // {11-0}   imm12/Rm
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  unsigned Rn = getARMRegisterNumbering(MO.getReg());
-  uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
-  Binary |= Rn << 14;
-  return Binary;
-}
-
-uint32_t ARMMCCodeEmitter::
-getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
-                          SmallVectorImpl<MCFixup> &Fixups) const {
-  // {13}     1 == imm12, 0 == Rm
-  // {12}     isAdd
-  // {11-0}   imm12/Rm
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
-  unsigned Imm = MO1.getImm();
-  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
-  bool isReg = MO.getReg() != 0;
-  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
-  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
-  if (isReg) {
-    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
-    Binary <<= 7;                    // Shift amount is bits [11:7]
-    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
-    Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0]
-  }
-  return Binary | (isAdd << 12) | (isReg << 13);
-}
-
-uint32_t ARMMCCodeEmitter::
-getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
-                          SmallVectorImpl<MCFixup> &Fixups) const {
-  // {9}      1 == imm8, 0 == Rm
-  // {8}      isAdd
-  // {7-4}    imm7_4/zero
-  // {3-0}    imm3_0/Rm
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
-  unsigned Imm = MO1.getImm();
-  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
-  bool isImm = MO.getReg() == 0;
-  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
-  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
-  if (!isImm)
-    Imm8 = getARMRegisterNumbering(MO.getReg());
-  return Imm8 | (isAdd << 8) | (isImm << 9);
-}
-
-uint32_t ARMMCCodeEmitter::
-getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  // {13}     1 == imm8, 0 == Rm
-  // {12-9}   Rn
-  // {8}      isAdd
-  // {7-4}    imm7_4/zero
-  // {3-0}    imm3_0/Rm
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
-  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
-  unsigned Rn = getARMRegisterNumbering(MO.getReg());
-  unsigned Imm = MO2.getImm();
-  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
-  bool isImm = MO1.getReg() == 0;
-  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
-  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
-  if (!isImm)
-    Imm8 = getARMRegisterNumbering(MO1.getReg());
-  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
-}
-
-/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
-uint32_t ARMMCCodeEmitter::
-getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
-                          SmallVectorImpl<MCFixup> &Fixups) const {
-  // [SP, #imm]
-  //   {7-0} = imm8
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
-         "Unexpected base register!");
-
-  // The immediate is already shifted for the implicit zeroes, so no change
-  // here.
-  return MO1.getImm() & 0xff;
-}
-
-/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
-uint32_t ARMMCCodeEmitter::
-getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
-                     SmallVectorImpl<MCFixup> &Fixups) const {
-  // [Rn, #imm]
-  //   {7-3} = imm5
-  //   {2-0} = Rn
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = getARMRegisterNumbering(MO.getReg());
-  unsigned Imm5 = MO1.getImm();
-  return ((Imm5 & 0x1f) << 3) | Rn;
-}
-
-/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
-uint32_t ARMMCCodeEmitter::
-getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
-                     SmallVectorImpl<MCFixup> &Fixups) const {
-  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
-}
-
-/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
-uint32_t ARMMCCodeEmitter::
-getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  // {12-9} = reg
-  // {8}    = (U)nsigned (add == '1', sub == '0')
-  // {7-0}  = imm8
-  unsigned Reg, Imm8;
-  bool isAdd;
-  // If The first operand isn't a register, we have a label reference.
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  if (!MO.isReg()) {
-    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
-    Imm8 = 0;
-    isAdd = false; // 'U' bit is handled as part of the fixup.
-
-    assert(MO.isExpr() && "Unexpected machine operand type!");
-    const MCExpr *Expr = MO.getExpr();
-    MCFixupKind Kind;
-    if (isThumb2())
-      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
-    else
-      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
-    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
-
-    ++MCNumCPRelocations;
-  } else {
-    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
-    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
-  }
-
-  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
-  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
-  if (isAdd)
-    Binary |= (1 << 8);
-  Binary |= (Reg << 9);
-  return Binary;
-}
-
-unsigned ARMMCCodeEmitter::
-getSORegOpValue(const MCInst &MI, unsigned OpIdx,
-                SmallVectorImpl<MCFixup> &Fixups) const {
-  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
-  // shifted. The second is either Rs, the amount to shift by, or reg0 in which
-  // case the imm contains the amount to shift by.
-  //
-  // {3-0} = Rm.
-  // {4}   = 1 if reg shift, 0 if imm shift
-  // {6-5} = type
-  //    If reg shift:
-  //      {11-8} = Rs
-  //      {7}    = 0
-  //    else (imm shift)
-  //      {11-7} = imm
-
-  const MCOperand &MO  = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
-  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
-
-  // Encode Rm.
-  unsigned Binary = getARMRegisterNumbering(MO.getReg());
-
-  // Encode the shift opcode.
-  unsigned SBits = 0;
-  unsigned Rs = MO1.getReg();
-  if (Rs) {
-    // Set shift operand (bit[7:4]).
-    // LSL - 0001
-    // LSR - 0011
-    // ASR - 0101
-    // ROR - 0111
-    // RRX - 0110 and bit[11:8] clear.
-    switch (SOpc) {
-    default: llvm_unreachable("Unknown shift opc!");
-    case ARM_AM::lsl: SBits = 0x1; break;
-    case ARM_AM::lsr: SBits = 0x3; break;
-    case ARM_AM::asr: SBits = 0x5; break;
-    case ARM_AM::ror: SBits = 0x7; break;
-    case ARM_AM::rrx: SBits = 0x6; break;
-    }
-  } else {
-    // Set shift operand (bit[6:4]).
-    // LSL - 000
-    // LSR - 010
-    // ASR - 100
-    // ROR - 110
-    switch (SOpc) {
-    default: llvm_unreachable("Unknown shift opc!");
-    case ARM_AM::lsl: SBits = 0x0; break;
-    case ARM_AM::lsr: SBits = 0x2; break;
-    case ARM_AM::asr: SBits = 0x4; break;
-    case ARM_AM::ror: SBits = 0x6; break;
-    }
-  }
-
-  Binary |= SBits << 4;
-  if (SOpc == ARM_AM::rrx)
-    return Binary;
-
-  // Encode the shift operation Rs or shift_imm (except rrx).
-  if (Rs) {
-    // Encode Rs bit[11:8].
-    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
-    return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
-  }
-
-  // Encode shift_imm bit[11:7].
-  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
-}
-
-unsigned ARMMCCodeEmitter::
-getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
-                SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO1 = MI.getOperand(OpNum);
-  const MCOperand &MO2 = MI.getOperand(OpNum+1);
-  const MCOperand &MO3 = MI.getOperand(OpNum+2);
-
-  // Encoded as [Rn, Rm, imm].
-  // FIXME: Needs fixup support.
-  unsigned Value = getARMRegisterNumbering(MO1.getReg());
-  Value <<= 4;
-  Value |= getARMRegisterNumbering(MO2.getReg());
-  Value <<= 2;
-  Value |= MO3.getImm();
-
-  return Value;
-}
-
-unsigned ARMMCCodeEmitter::
-getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO1 = MI.getOperand(OpNum);
-  const MCOperand &MO2 = MI.getOperand(OpNum+1);
-
-  // FIXME: Needs fixup support.
-  unsigned Value = getARMRegisterNumbering(MO1.getReg());
-
-  // Even though the immediate is 8 bits long, we need 9 bits in order
-  // to represent the (inverse of the) sign bit.
-  Value <<= 9;
-  int32_t tmp = (int32_t)MO2.getImm();
-  if (tmp < 0)
-    tmp = abs(tmp);
-  else
-    Value |= 256; // Set the ADD bit
-  Value |= tmp & 255;
-  return Value;
-}
-
-unsigned ARMMCCodeEmitter::
-getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO1 = MI.getOperand(OpNum);
-
-  // FIXME: Needs fixup support.
-  unsigned Value = 0;
-  int32_t tmp = (int32_t)MO1.getImm();
-  if (tmp < 0)
-    tmp = abs(tmp);
-  else
-    Value |= 256; // Set the ADD bit
-  Value |= tmp & 255;
-  return Value;
-}
-
-unsigned ARMMCCodeEmitter::
-getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO1 = MI.getOperand(OpNum);
-
-  // FIXME: Needs fixup support.
-  unsigned Value = 0;
-  int32_t tmp = (int32_t)MO1.getImm();
-  if (tmp < 0)
-    tmp = abs(tmp);
-  else
-    Value |= 4096; // Set the ADD bit
-  Value |= tmp & 4095;
-  return Value;
-}
-
-unsigned ARMMCCodeEmitter::
-getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
-                SmallVectorImpl<MCFixup> &Fixups) const {
-  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
-  // shifted. The second is the amount to shift by.
-  //
-  // {3-0} = Rm.
-  // {4}   = 0
-  // {6-5} = type
-  // {11-7} = imm
-
-  const MCOperand &MO  = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
-
-  // Encode Rm.
-  unsigned Binary = getARMRegisterNumbering(MO.getReg());
-
-  // Encode the shift opcode.
-  unsigned SBits = 0;
-  // Set shift operand (bit[6:4]).
-  // LSL - 000
-  // LSR - 010
-  // ASR - 100
-  // ROR - 110
-  switch (SOpc) {
-  default: llvm_unreachable("Unknown shift opc!");
-  case ARM_AM::lsl: SBits = 0x0; break;
-  case ARM_AM::lsr: SBits = 0x2; break;
-  case ARM_AM::asr: SBits = 0x4; break;
-  case ARM_AM::ror: SBits = 0x6; break;
-  }
-
-  Binary |= SBits << 4;
-  if (SOpc == ARM_AM::rrx)
-    return Binary;
-
-  // Encode shift_imm bit[11:7].
-  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
-}
-
-unsigned ARMMCCodeEmitter::
-getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
-  // msb of the mask.
-  const MCOperand &MO = MI.getOperand(Op);
-  uint32_t v = ~MO.getImm();
-  uint32_t lsb = CountTrailingZeros_32(v);
-  uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
-  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
-  return lsb | (msb << 5);
-}
-
-unsigned ARMMCCodeEmitter::
-getMsbOpValue(const MCInst &MI, unsigned Op,
-              SmallVectorImpl<MCFixup> &Fixups) const {
-  // MSB - 5 bits.
-  uint32_t lsb = MI.getOperand(Op-1).getImm();
-  uint32_t width = MI.getOperand(Op).getImm();
-  uint32_t msb = lsb+width-1;
-  assert (width != 0 && msb < 32 && "Illegal bit width!");
-  return msb;
-}
-
-unsigned ARMMCCodeEmitter::
-getSsatBitPosValue(const MCInst &MI, unsigned Op,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  // For ssat instructions, the bit position should be encoded decremented by 1
-  return MI.getOperand(Op).getImm()-1;
-}
-
-unsigned ARMMCCodeEmitter::
-getRegisterListOpValue(const MCInst &MI, unsigned Op,
-                       SmallVectorImpl<MCFixup> &Fixups) const {
-  // VLDM/VSTM:
-  //   {12-8} = Vd
-  //   {7-0}  = Number of registers
-  //
-  // LDM/STM:
-  //   {15-0}  = Bitfield of GPRs.
-  unsigned Reg = MI.getOperand(Op).getReg();
-  bool SPRRegs = ARM::SPRRegClass.contains(Reg);
-  bool DPRRegs = ARM::DPRRegClass.contains(Reg);
-
-  unsigned Binary = 0;
-
-  if (SPRRegs || DPRRegs) {
-    // VLDM/VSTM
-    unsigned RegNo = getARMRegisterNumbering(Reg);
-    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
-    Binary |= (RegNo & 0x1f) << 8;
-    if (SPRRegs)
-      Binary |= NumRegs;
-    else
-      Binary |= NumRegs * 2;
-  } else {
-    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
-      unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg());
-      Binary |= 1 << RegNo;
-    }
-  }
-
-  return Binary;
-}
-
-/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
-/// with the alignment operand.
-unsigned ARMMCCodeEmitter::
-getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
-                           SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &Reg = MI.getOperand(Op);
-  const MCOperand &Imm = MI.getOperand(Op + 1);
-
-  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
-  unsigned Align = 0;
-
-  switch (Imm.getImm()) {
-  default: break;
-  case 2:
-  case 4:
-  case 8:  Align = 0x01; break;
-  case 16: Align = 0x02; break;
-  case 32: Align = 0x03; break;
-  }
-
-  return RegNo | (Align << 4);
-}
-
-/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
-/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
-unsigned ARMMCCodeEmitter::
-getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
-                                    SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &Reg = MI.getOperand(Op);
-  const MCOperand &Imm = MI.getOperand(Op + 1);
-
-  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
-  unsigned Align = 0;
-
-  switch (Imm.getImm()) {
-  default: break;
-  case 2:
-  case 4:
-  case 8:
-  case 16: Align = 0x00; break;
-  case 32: Align = 0x03; break;
-  }
-
-  return RegNo | (Align << 4);
-}
-
-
-/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
-/// alignment operand for use in VLD-dup instructions.  This is the same as
-/// getAddrMode6AddressOpValue except for the alignment encoding, which is
-/// different for VLD4-dup.
-unsigned ARMMCCodeEmitter::
-getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &Reg = MI.getOperand(Op);
-  const MCOperand &Imm = MI.getOperand(Op + 1);
-
-  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
-  unsigned Align = 0;
-
-  switch (Imm.getImm()) {
-  default: break;
-  case 2:
-  case 4:
-  case 8:  Align = 0x01; break;
-  case 16: Align = 0x03; break;
-  }
-
-  return RegNo | (Align << 4);
-}
-
-unsigned ARMMCCodeEmitter::
-getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
-                          SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(Op);
-  if (MO.getReg() == 0) return 0x0D;
-  return MO.getReg();
-}
-
-unsigned ARMMCCodeEmitter::
-getShiftRight8Imm(const MCInst &MI, unsigned Op,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  return 8 - MI.getOperand(Op).getImm();
-}
-
-unsigned ARMMCCodeEmitter::
-getShiftRight16Imm(const MCInst &MI, unsigned Op,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  return 16 - MI.getOperand(Op).getImm();
-}
-
-unsigned ARMMCCodeEmitter::
-getShiftRight32Imm(const MCInst &MI, unsigned Op,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  return 32 - MI.getOperand(Op).getImm();
-}
-
-unsigned ARMMCCodeEmitter::
-getShiftRight64Imm(const MCInst &MI, unsigned Op,
-                   SmallVectorImpl<MCFixup> &Fixups) const {
-  return 64 - MI.getOperand(Op).getImm();
-}
-
-void ARMMCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  // Pseudo instructions don't get encoded.
-  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
-  uint64_t TSFlags = Desc.TSFlags;
-  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
-    return;
-
-  int Size;
-  if (Desc.getSize() == 2 || Desc.getSize() == 4)
-    Size = Desc.getSize();
-  else
-    llvm_unreachable("Unexpected instruction size!");
-  
-  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
-  // Thumb 32-bit wide instructions need to emit the high order halfword
-  // first.
-  if (isThumb() && Size == 4) {
-    EmitConstant(Binary >> 16, 2, OS);
-    EmitConstant(Binary & 0xffff, 2, OS);
-  } else
-    EmitConstant(Binary, Size, OS);
-  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
-}
-
-#include "ARMGenMCCodeEmitter.inc"
diff --git a/lib/Target/ARM/ARMMCExpr.cpp b/lib/Target/ARM/ARMMCExpr.cpp
deleted file mode 100644
index 2727ba8c8aa5f..0000000000000
--- a/lib/Target/ARM/ARMMCExpr.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "armmcexpr"
-#include "ARMMCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAssembler.h"
-using namespace llvm;
-
-const ARMMCExpr*
-ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
-                       MCContext &Ctx) {
-  return new (Ctx) ARMMCExpr(Kind, Expr);
-}
-
-void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
-  switch (Kind) {
-  default: assert(0 && "Invalid kind!");
-  case VK_ARM_HI16: OS << ":upper16:"; break;
-  case VK_ARM_LO16: OS << ":lower16:"; break;
-  }
-
-  const MCExpr *Expr = getSubExpr();
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << '(';
-  Expr->print(OS);
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << ')';
-}
-
-bool
-ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                     const MCAsmLayout *Layout) const {
-  return false;
-}
-
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
-  switch (Value->getKind()) {
-  case MCExpr::Target:
-    assert(0 && "Can't handle nested target expr!");
-    break;
-
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-    AddValueSymbols_(BE->getLHS(), Asm);
-    AddValueSymbols_(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef:
-    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
-    break;
-
-  case MCExpr::Unary:
-    AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const {
-  AddValueSymbols_(getSubExpr(), Asm);
-}
diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/ARMMCExpr.h
deleted file mode 100644
index 0a2e883deb1d4..0000000000000
--- a/lib/Target/ARM/ARMMCExpr.h
+++ /dev/null
@@ -1,76 +0,0 @@
-//===-- ARMMCExpr.h - ARM specific MC expression classes ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMMCEXPR_H
-#define ARMMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class ARMMCExpr : public MCTargetExpr {
-public:
-  enum VariantKind {
-    VK_ARM_None,
-    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
-    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
-  };
-
-private:
-  const VariantKind Kind;
-  const MCExpr *Expr;
-
-  explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
-    : Kind(_Kind), Expr(_Expr) {}
-  
-public:
-  /// @name Construction
-  /// @{
-
-  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
-                                      MCContext &Ctx);
-
-  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_ARM_HI16, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_ARM_LO16, Expr, Ctx);
-  }
-
-  /// @}
-  /// @name Accessors
-  /// @{
-
-  /// getOpcode - Get the kind of this expression.
-  VariantKind getKind() const { return Kind; }
-
-  /// getSubExpr - Get the child of this expression.
-  const MCExpr *getSubExpr() const { return Expr; }
-
-  /// @}
-
-  void PrintImpl(raw_ostream &OS) const;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
-    return getSubExpr()->FindAssociatedSection();
-  }
-
-  static bool classof(const MCExpr *E) {
-    return E->getKind() == MCExpr::Target;
-  }
-  
-  static bool classof(const ARMMCExpr *) { return true; }
-
-};
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 7411b599f0fac..daa126def4017 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -14,7 +14,7 @@
 
 #include "ARM.h"
 #include "ARMAsmPrinter.h"
-#include "ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/Constants.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/MC/MCExpr.h"
diff --git a/lib/Target/ARM/ARMMachObjectWriter.cpp b/lib/Target/ARM/ARMMachObjectWriter.cpp
deleted file mode 100644
index a36e47da06d4d..0000000000000
--- a/lib/Target/ARM/ARMMachObjectWriter.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "ARMFixupKinds.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCFixupKindInfo.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetAsmBackend.h"
-using namespace llvm;
-using namespace llvm::object;
-
-namespace {
-class ARMMachObjectWriter : public MCMachObjectTargetWriter {
-  void RecordARMScatteredRelocation(MachObjectWriter *Writer,
-                                    const MCAssembler &Asm,
-                                    const MCAsmLayout &Layout,
-                                    const MCFragment *Fragment,
-                                    const MCFixup &Fixup,
-                                    MCValue Target,
-                                    unsigned Log2Size,
-                                    uint64_t &FixedValue);
-  void RecordARMMovwMovtRelocation(MachObjectWriter *Writer,
-                                   const MCAssembler &Asm,
-                                   const MCAsmLayout &Layout,
-                                   const MCFragment *Fragment,
-                                   const MCFixup &Fixup, MCValue Target,
-                                   uint64_t &FixedValue);
-
-public:
-  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
-                               /*UseAggressiveSymbolFolding=*/true) {}
-
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue);
-};
-}
-
-static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
-                              unsigned &Log2Size) {
-  RelocType = unsigned(macho::RIT_Vanilla);
-  Log2Size = ~0U;
-
-  switch (Kind) {
-  default:
-    return false;
-
-  case FK_Data_1:
-    Log2Size = llvm::Log2_32(1);
-    return true;
-  case FK_Data_2:
-    Log2Size = llvm::Log2_32(2);
-    return true;
-  case FK_Data_4:
-    Log2Size = llvm::Log2_32(4);
-    return true;
-  case FK_Data_8:
-    Log2Size = llvm::Log2_32(8);
-    return true;
-
-    // Handle 24-bit branch kinds.
-  case ARM::fixup_arm_ldst_pcrel_12:
-  case ARM::fixup_arm_pcrel_10:
-  case ARM::fixup_arm_adr_pcrel_12:
-  case ARM::fixup_arm_condbranch:
-  case ARM::fixup_arm_uncondbranch:
-    RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
-    // Report as 'long', even though that is not quite accurate.
-    Log2Size = llvm::Log2_32(4);
-    return true;
-
-    // Handle Thumb branches.
-  case ARM::fixup_arm_thumb_br:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
-    Log2Size = llvm::Log2_32(2);
-    return true;
-
-  case ARM::fixup_t2_uncondbranch:
-  case ARM::fixup_arm_thumb_bl:
-  case ARM::fixup_arm_thumb_blx:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
-    Log2Size = llvm::Log2_32(4);
-    return true;
-
-  case ARM::fixup_arm_movt_hi16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
-  case ARM::fixup_t2_movt_hi16:
-  case ARM::fixup_t2_movt_hi16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_HalfDifference);
-    // Report as 'long', even though that is not quite accurate.
-    Log2Size = llvm::Log2_32(4);
-    return true;
-
-  case ARM::fixup_arm_movw_lo16:
-  case ARM::fixup_arm_movw_lo16_pcrel:
-  case ARM::fixup_t2_movw_lo16:
-  case ARM::fixup_t2_movw_lo16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
-    // Report as 'long', even though that is not quite accurate.
-    Log2Size = llvm::Log2_32(4);
-    return true;
-  }
-}
-
-void ARMMachObjectWriter::
-RecordARMMovwMovtRelocation(MachObjectWriter *Writer,
-                            const MCAssembler &Asm,
-                            const MCAsmLayout &Layout,
-                            const MCFragment *Fragment,
-                            const MCFixup &Fixup,
-                            MCValue Target,
-                            uint64_t &FixedValue) {
-  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_ARM_Half;
-
-  // See <reloc.h>.
-  const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
-
-  if (!A_SD->getFragment())
-    report_fatal_error("symbol '" + A->getName() +
-                       "' can not be undefined in a subtraction expression");
-
-  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
-  uint32_t Value2 = 0;
-  uint64_t SecAddr =
-    Writer->getSectionAddress(A_SD->getFragment()->getParent());
-  FixedValue += SecAddr;
-
-  if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
-
-    if (!B_SD->getFragment())
-      report_fatal_error("symbol '" + B->getSymbol().getName() +
-                         "' can not be undefined in a subtraction expression");
-
-    // Select the appropriate difference relocation type.
-    Type = macho::RIT_ARM_HalfDifference;
-    Value2 = Writer->getSymbolAddress(B_SD, Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
-  }
-
-  // Relocations are written out in reverse order, so the PAIR comes first.
-  // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
-  //
-  // For these two r_type relocations they always have a pair following them and
-  // the r_length bits are used differently.  The encoding of the r_length is as
-  // follows:
-  //   low bit of r_length:
-  //      0 - :lower16: for movw instructions
-  //      1 - :upper16: for movt instructions
-  //   high bit of r_length:
-  //      0 - arm instructions
-  //      1 - thumb instructions
-  // the other half of the relocated expression is in the following pair
-  // relocation entry in the the low 16 bits of r_address field.
-  unsigned ThumbBit = 0;
-  unsigned MovtBit = 0;
-  switch ((unsigned)Fixup.getKind()) {
-  default: break;
-  case ARM::fixup_arm_movt_hi16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
-    MovtBit = 1;
-    break;
-  case ARM::fixup_t2_movt_hi16:
-  case ARM::fixup_t2_movt_hi16_pcrel:
-    MovtBit = 1;
-    // Fallthrough
-  case ARM::fixup_t2_movw_lo16:
-  case ARM::fixup_t2_movw_lo16_pcrel:
-    ThumbBit = 1;
-    break;
-  }
-
-
-  if (Type == macho::RIT_ARM_HalfDifference) {
-    uint32_t OtherHalf = MovtBit
-      ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
-
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((OtherHalf       <<  0) |
-                 (macho::RIT_Pair << 24) |
-                 (MovtBit         << 28) |
-                 (ThumbBit        << 29) |
-                 (IsPCRel         << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
-  }
-
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (MovtBit     << 28) |
-               (ThumbBit    << 29) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
-                                                    const MCAssembler &Asm,
-                                                    const MCAsmLayout &Layout,
-                                                    const MCFragment *Fragment,
-                                                    const MCFixup &Fixup,
-                                                    MCValue Target,
-                                                    unsigned Log2Size,
-                                                    uint64_t &FixedValue) {
-  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
-
-  // See <reloc.h>.
-  const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
-
-  if (!A_SD->getFragment())
-    report_fatal_error("symbol '" + A->getName() +
-                       "' can not be undefined in a subtraction expression");
-
-  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
-  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
-  FixedValue += SecAddr;
-  uint32_t Value2 = 0;
-
-  if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
-
-    if (!B_SD->getFragment())
-      report_fatal_error("symbol '" + B->getSymbol().getName() +
-                         "' can not be undefined in a subtraction expression");
-
-    // Select the appropriate difference relocation type.
-    Type = macho::RIT_Difference;
-    Value2 = Writer->getSymbolAddress(B_SD, Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
-  }
-
-  // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
-  }
-
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
-                                           const MCAssembler &Asm,
-                                           const MCAsmLayout &Layout,
-                                           const MCFragment *Fragment,
-                                           const MCFixup &Fixup,
-                                           MCValue Target,
-                                           uint64_t &FixedValue) {
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Log2Size;
-  unsigned RelocType = macho::RIT_Vanilla;
-  if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) {
-    report_fatal_error("unknown ARM fixup kind!");
-    return;
-  }
-
-  // If this is a difference or a defined symbol plus an offset, then we need a
-  // scattered relocation entry.  Differences always require scattered
-  // relocations.
-  if (Target.getSymB()) {
-    if (RelocType == macho::RIT_ARM_Half ||
-        RelocType == macho::RIT_ARM_HalfDifference)
-      return RecordARMMovwMovtRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                         Target, FixedValue);
-    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                        Target, Log2Size, FixedValue);
-  }
-
-  // Get the symbol data, if any.
-  MCSymbolData *SD = 0;
-  if (Target.getSymA())
-    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
-
-  // FIXME: For other platforms, we need to use scattered relocations for
-  // internal relocations with offsets.  If this is an internal relocation with
-  // an offset, it also needs a scattered relocation entry.
-  //
-  // Is this right for ARM?
-  uint32_t Offset = Target.getConstant();
-  if (IsPCRel && RelocType == macho::RIT_Vanilla)
-    Offset += 1 << Log2Size;
-  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
-    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                        Target, Log2Size, FixedValue);
-
-  // See <reloc.h>.
-  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned Index = 0;
-  unsigned IsExtern = 0;
-  unsigned Type = 0;
-
-  if (Target.isAbsolute()) { // constant
-    // FIXME!
-    report_fatal_error("FIXME: relocations to absolute targets "
-                       "not yet implemented");
-  } else {
-    // Resolve constant variables.
-    if (SD->getSymbol().isVariable()) {
-      int64_t Res;
-      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
-            Res, Layout, Writer->getSectionAddressMap())) {
-        FixedValue = Res;
-        return;
-      }
-    }
-
-    // Check whether we need an external or internal relocation.
-    if (Writer->doesSymbolRequireExternRelocation(SD)) {
-      IsExtern = 1;
-      Index = SD->getIndex();
-
-      // For external relocations, make sure to offset the fixup value to
-      // compensate for the addend of the symbol address, if it was
-      // undefined. This occurs with weak definitions, for example.
-      if (!SD->Symbol->isUndefined())
-        FixedValue -= Layout.getSymbolOffset(SD);
-    } else {
-      // The index is the section ordinal (1-based).
-      const MCSectionData &SymSD = Asm.getSectionData(
-        SD->getSymbol().getSection());
-      Index = SymSD.getOrdinal() + 1;
-      FixedValue += Writer->getSectionAddress(&SymSD);
-    }
-    if (IsPCRel)
-      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
-
-    // The type is determined by the fixup kind.
-    Type = RelocType;
-  }
-
-  // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
-                                                bool Is64Bit,
-                                                uint32_t CPUType,
-                                                uint32_t CPUSubtype) {
-  return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit,
-                                                        CPUType,
-                                                        CPUSubtype),
-                                OS, /*IsLittleEndian=*/true);
-}
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 76eb496bde42c..036822d18ad2e 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -182,8 +182,10 @@ def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>;
 
 // Current Program Status Register.
 def CPSR    : ARMReg<0, "cpsr">;
-def FPSCR   : ARMReg<1, "fpscr">;
-def ITSTATE : ARMReg<2, "itstate">;
+def APSR    : ARMReg<1, "apsr">;
+def SPSR    : ARMReg<2, "spsr">;
+def FPSCR   : ARMReg<3, "fpscr">;
+def ITSTATE : ARMReg<4, "itstate">;
 
 // Special Registers - only available in privileged mode.
 def FPSID   : ARMReg<0, "fpsid">;
@@ -213,6 +215,23 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
   }];
 }
 
+// GPRs without the PC.  Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination.  Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP an an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
 // restricted GPR register class. Many Thumb2 instructions allow the full
 // register range for operands, but have undefined behaviours when PC
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
@@ -328,5 +347,6 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (sequence "QQQQ%u", 0, 3)> {
 
 // Condition code registers.
 def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
   let isAllocatable = 0;
 }
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index ef0aaf2a59eb9..a3a3d58841db6 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -138,13 +138,12 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
 // Adjust parameters for memset, EABI uses format (ptr, size, value),
 // GNU library uses (ptr, value, size)
 // See RTABI section 4.3.4
-SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
-                                             SDValue Chain, SDValue Dst,
-                                             SDValue Src, SDValue Size,
-                                             unsigned Align, bool isVolatile,
-                                             MachinePointerInfo DstPtrInfo) const
-{
+SDValue ARMSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                        SDValue Chain, SDValue Dst,
+                        SDValue Src, SDValue Size,
+                        unsigned Align, bool isVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
   // Use default for non AAPCS subtargets
   if (!Subtarget->isAAPCS_ABI())
     return SDValue();
@@ -155,7 +154,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
   TargetLowering::ArgListEntry Entry;
 
   // First argument: data pointer
-  const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
   Entry.Node = Dst;
   Entry.Ty = IntPtrTy;
   Args.push_back(Entry);
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index ec1bf5ca1941d..6419a737295a7 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -14,10 +14,27 @@
 #ifndef ARMSELECTIONDAGINFO_H
 #define ARMSELECTIONDAGINFO_H
 
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 
 namespace llvm {
 
+namespace ARM_AM {
+  static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) {
+    switch (Opcode) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+}  // end namespace ARM_AM
+
 class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 1cab9e44ce758..247d6be59ad4f 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -53,11 +53,14 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
   , HasVMLxForwarding(false)
   , SlowFPBrcc(false)
   , InThumbMode(false)
+  , InNaClMode(false)
   , HasThumb2(false)
+  , IsMClass(false)
   , NoARM(false)
   , PostRAScheduler(false)
   , IsR9Reserved(ReserveR9)
   , UseMovt(false)
+  , SupportsTailCall(false)
   , HasFP16(false)
   , HasD16(false)
   , HasHardwareDivide(false)
@@ -111,6 +114,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
   else {
     IsR9Reserved = ReserveR9 | !HasV6Ops;
     UseMovt = DarwinUseMOVT && hasV6T2Ops();
+    const Triple &T = getTargetTriple();
+    SupportsTailCall = T.getOS() == Triple::IOS && !T.isOSVersionLT(5, 0);
   }
 
   if (!isThumb() || hasThumb2())
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c6508723a5768..b63e1085fb83a 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -70,9 +70,16 @@ protected:
   /// InThumbMode - True if compiling for Thumb, false for ARM.
   bool InThumbMode;
 
+  /// InNaClMode - True if targeting Native Client
+  bool InNaClMode;
+
   /// HasThumb2 - True if Thumb2 instructions are supported.
   bool HasThumb2;
 
+  /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs - 
+  /// v6m, v7m for example.
+  bool IsMClass;
+
   /// NoARM - True if subtarget does not support ARM mode execution.
   bool NoARM;
 
@@ -86,6 +93,11 @@ protected:
   /// imms (including global addresses).
   bool UseMovt;
 
+  /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+  /// must be able to synthesize call stubs for interworking between ARM and
+  /// Thumb.
+  bool SupportsTailCall;
+
   /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
   /// only so far)
   bool HasFP16;
@@ -209,6 +221,9 @@ protected:
   const Triple &getTargetTriple() const { return TargetTriple; }
 
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetNaCl() const {
+    return TargetTriple.getOS() == Triple::NativeClient;
+  }
   bool isTargetELF() const { return !isTargetDarwin(); }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
@@ -218,10 +233,13 @@ protected:
   bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
   bool isThumb2() const { return InThumbMode && HasThumb2; }
   bool hasThumb2() const { return HasThumb2; }
+  bool isMClass() const { return IsMClass; }
+  bool isARClass() const { return !IsMClass; }
 
   bool isR9Reserved() const { return IsR9Reserved; }
 
   bool useMovt() const { return UseMovt && hasV6T2Ops(); }
+  bool supportsTailCall() const { return SupportsTailCall; }
 
   bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
 
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index f0b176ad69812..96b1e89b0df09 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -15,77 +15,50 @@
 #include "ARM.h"
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 using namespace llvm;
 
-// This is duplicated code. Refactor this.
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
-                                    MCContext &Ctx, TargetAsmBackend &TAB,
-                                    raw_ostream &OS,
-                                    MCCodeEmitter *Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin())
-    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-
-  if (TheTriple.isOSWindows()) {
-    llvm_unreachable("ARM does not support Windows COFF format");
-    return NULL;
-  }
-
-  return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
-}
+static cl::opt<bool>
+EnableGlobalMerge("global-merge", cl::Hidden,
+                  cl::desc("Enable global merge pass"),
+                  cl::init(true));
 
 extern "C" void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
   RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
-
-  // Register the MC Code Emitter
-  TargetRegistry::RegisterCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
-  TargetRegistry::RegisterCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
-
-  // Register the asm backend.
-  TargetRegistry::RegisterAsmBackend(TheARMTarget, createARMAsmBackend);
-  TargetRegistry::RegisterAsmBackend(TheThumbTarget, createARMAsmBackend);
-
-  // Register the object streamer.
-  TargetRegistry::RegisterObjectStreamer(TheARMTarget, createMCStreamer);
-  TargetRegistry::RegisterObjectStreamer(TheThumbTarget, createMCStreamer);
-
 }
 
 /// TargetMachine ctor - Create an ARM architecture model.
 ///
-ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
-                                           const std::string &TT,
-                                           const std::string &CPU,
-                                           const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           Reloc::Model RM, CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS),
     JITInfo(),
     InstrItins(Subtarget.getInstrItineraryData()) {
-  DefRelocModel = getRelocationModel();
-
   // Default to soft float ABI
   if (FloatABIType == FloatABI::Default)
     FloatABIType = FloatABI::Soft;
 }
 
-ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
-                                   const std::string &CPU,
-                                   const std::string &FS)
-  : ARMBaseTargetMachine(T, TT, CPU, FS), InstrInfo(Subtarget),
+ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   Reloc::Model RM, CodeModel::Model CM)
+  : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), InstrInfo(Subtarget),
     DataLayout(Subtarget.isAPCS_ABI() ?
                std::string("e-p:32:32-f64:32:64-i64:32:64-"
-                           "v128:32:128-v64:32:64-n32") :
+                           "v128:32:128-v64:32:64-n32-S32") :
+               Subtarget.isAAPCS_ABI() ?
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "v128:64:128-v64:64:64-n32-S64") :
                std::string("e-p:32:32-f64:64:64-i64:64:64-"
-                           "v128:64:128-v64:64:64-n32")),
+                           "v128:64:128-v64:64:64-n32-S32")),
     ELFWriterInfo(*this),
     TLInfo(*this),
     TSInfo(*this),
@@ -95,20 +68,24 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
                        "support ARM mode execution!");
 }
 
-ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
-                                       const std::string &CPU,
-                                       const std::string &FS)
-  : ARMBaseTargetMachine(T, TT, CPU, FS),
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM),
     InstrInfo(Subtarget.hasThumb2()
               ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
               : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
     DataLayout(Subtarget.isAPCS_ABI() ?
                std::string("e-p:32:32-f64:32:64-i64:32:64-"
                            "i16:16:32-i8:8:32-i1:8:32-"
-                           "v128:32:128-v64:32:64-a:0:32-n32") :
+                           "v128:32:128-v64:32:64-a:0:32-n32-S32") :
+               Subtarget.isAAPCS_ABI() ?
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "i16:16:32-i8:8:32-i1:8:32-"
+                           "v128:64:128-v64:64:64-a:0:32-n32-S64") :
                std::string("e-p:32:32-f64:64:64-i64:64:64-"
                            "i16:16:32-i8:8:32-i1:8:32-"
-                           "v128:64:128-v64:64:64-a:0:32-n32")),
+                           "v128:64:128-v64:64:64-a:0:32-n32-S32")),
     ELFWriterInfo(*this),
     TLInfo(*this),
     TSInfo(*this),
@@ -117,10 +94,9 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
               : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
 }
 
-// Pass Pipeline Configuration
 bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM,
                                       CodeGenOpt::Level OptLevel) {
-  if (OptLevel != CodeGenOpt::None)
+  if (OptLevel != CodeGenOpt::None && EnableGlobalMerge)
     PM.add(createARMGlobalMergePass(getTargetLowering()));
 
   return false;
@@ -139,7 +115,6 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
     PM.add(createARMLoadStoreOptimizationPass(true));
   if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9())
     PM.add(createMLxExpansionPass());
-
   return true;
 }
 
@@ -150,7 +125,7 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
     if (!Subtarget.isThumb1Only())
       PM.add(createARMLoadStoreOptimizationPass());
     if (Subtarget.hasNEON())
-      PM.add(createNEONMoveFixPass());
+      PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass));
   }
 
   // Expand some pseudo instructions into multiple instructions to allow
@@ -179,10 +154,6 @@ bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
 bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
                                           CodeGenOpt::Level OptLevel,
                                           JITCodeEmitter &JCE) {
-  // FIXME: Move this to TargetJITInfo!
-  if (DefRelocModel == Reloc::Default)
-    setRelocationModel(Reloc::Static);
-
   // Machine code emitter pass for ARM.
   PM.add(createARMJITCodeEmitterPass(*this, JCE));
   return false;
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index bc3d46a50ea52..c8c601c301715 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -37,11 +37,11 @@ protected:
 private:
   ARMJITInfo          JITInfo;
   InstrItineraryData  InstrItins;
-  Reloc::Model        DefRelocModel;    // Reloc model before it's overridden.
 
 public:
-  ARMBaseTargetMachine(const Target &T, const std::string &TT,
-                       const std::string &CPU, const std::string &FS);
+  ARMBaseTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       Reloc::Model RM, CodeModel::Model CM);
 
   virtual       ARMJITInfo       *getJITInfo()         { return &JITInfo; }
   virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
@@ -69,8 +69,9 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
   ARMSelectionDAGInfo TSInfo;
   ARMFrameLowering    FrameLowering;
  public:
-  ARMTargetMachine(const Target &T, const std::string &TT,
-                   const std::string &CPU, const std::string &FS);
+  ARMTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS,
+                   Reloc::Model RM, CodeModel::Model CM);
 
   virtual const ARMRegisterInfo  *getRegisterInfo() const {
     return &InstrInfo.getRegisterInfo();
@@ -108,8 +109,9 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
   // Either Thumb1FrameLowering or ARMFrameLowering.
   OwningPtr<ARMFrameLowering> FrameLowering;
 public:
-  ThumbTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS);
+  ThumbTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 
   /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
   virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
diff --git a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
index d9a5fa223b4bd..14d35ba546542 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
@@ -7,16 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARM.h"
-#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
 
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
 
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
@@ -30,7 +29,7 @@ using namespace llvm;
 
 namespace {
 
-class ARMBaseAsmLexer : public TargetAsmLexer {
+class ARMBaseAsmLexer : public MCTargetAsmLexer {
   const MCAsmInfo &AsmInfo;
 
   const AsmToken &lexDefinite() {
@@ -43,7 +42,7 @@ protected:
 
   rmap_ty RegisterMap;
 
-  void InitRegisterMap(const TargetRegisterInfo *info) {
+  void InitRegisterMap(const MCRegisterInfo *info) {
     unsigned numRegs = info->getNumRegs();
 
     for (unsigned i = 0; i < numRegs; ++i) {
@@ -77,33 +76,23 @@ protected:
   }
 public:
   ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
-    : TargetAsmLexer(T), AsmInfo(MAI) {
+    : MCTargetAsmLexer(T), AsmInfo(MAI) {
   }
 };
 
 class ARMAsmLexer : public ARMBaseAsmLexer {
 public:
-  ARMAsmLexer(const Target &T, const MCAsmInfo &MAI)
+  ARMAsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI)
     : ARMBaseAsmLexer(T, MAI) {
-    std::string tripleString("arm-unknown-unknown");
-    std::string featureString;
-    std::string CPU;
-    OwningPtr<const TargetMachine>
-      targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
-    InitRegisterMap(targetMachine->getRegisterInfo());
+    InitRegisterMap(&MRI);
   }
 };
 
 class ThumbAsmLexer : public ARMBaseAsmLexer {
 public:
-  ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI)
+  ThumbAsmLexer(const Target &T, const MCRegisterInfo &MRI,const MCAsmInfo &MAI)
     : ARMBaseAsmLexer(T, MAI) {
-    std::string tripleString("thumb-unknown-unknown");
-    std::string featureString;
-    std::string CPU;
-    OwningPtr<const TargetMachine>
-      targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
-    InitRegisterMap(targetMachine->getRegisterInfo());
+    InitRegisterMap(&MRI);
   }
 };
 
@@ -149,6 +138,6 @@ AsmToken ARMBaseAsmLexer::LexTokenUAL() {
 }
 
 extern "C" void LLVMInitializeARMAsmLexer() {
-  RegisterAsmLexer<ARMAsmLexer> X(TheARMTarget);
-  RegisterAsmLexer<ThumbAsmLexer> Y(TheThumbTarget);
+  RegisterMCAsmLexer<ARMAsmLexer> X(TheARMTarget);
+  RegisterMCAsmLexer<ThumbAsmLexer> Y(TheThumbTarget);
 }
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index a4741270c7a5f..24f15b4694ff0 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,11 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMMCExpr.h"
-#include "ARMBaseRegisterInfo.h"
-#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -20,12 +18,17 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -37,49 +40,65 @@ namespace {
 
 class ARMOperand;
 
-class ARMAsmParser : public TargetAsmParser {
+class ARMAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
 
+  struct {
+    ARMCC::CondCodes Cond;    // Condition for IT block.
+    unsigned Mask:4;          // Condition mask for instructions.
+                              // Starting at first 1 (from lsb).
+                              //   '1'  condition as indicated in IT.
+                              //   '0'  inverse of condition (else).
+                              // Count of instructions in IT block is
+                              // 4 - trailingzeroes(mask)
+
+    bool FirstCond;           // Explicit flag for when we're parsing the
+                              // First instruction in the IT block. It's
+                              // implied in the mask, so needs special
+                              // handling.
+
+    unsigned CurPosition;     // Current position in parsing of IT
+                              // block. In range [0,3]. Initialized
+                              // according to count of instructions in block.
+                              // ~0U if no active IT block.
+  } ITState;
+  bool inITBlock() { return ITState.CurPosition != ~0U;}
+  void forwardITPosition() {
+    if (!inITBlock()) return;
+    // Move to the next instruction in the IT block, if there is one. If not,
+    // mark the block as done.
+    unsigned TZ = CountTrailingZeros_32(ITState.Mask);
+    if (++ITState.CurPosition == 5 - TZ)
+      ITState.CurPosition = ~0U; // Done with the IT block after this.
+  }
+
+
   MCAsmParser &getParser() const { return Parser; }
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
 
   void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
   bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
 
-  int TryParseRegister();
-  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
-  bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
-  int TryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &,
-                   ARMII::AddrMode AddrMode);
-  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
-  bool ParsePrefix(ARMMCExpr::VariantKind &RefKind);
-  const MCExpr *ApplyPrefixToExpr(const MCExpr *E,
-                                  MCSymbolRefExpr::VariantKind Variant);
-
-
-  bool ParseMemoryOffsetReg(bool &Negative,
-                            bool &OffsetRegShifted,
-                            enum ARM_AM::ShiftOpc &ShiftType,
-                            const MCExpr *&ShiftAmount,
-                            const MCExpr *&Offset,
-                            bool &OffsetIsReg,
-                            int &OffsetRegNum,
-                            SMLoc &E);
-  bool ParseShift(enum ARM_AM::ShiftOpc &St,
-                  const MCExpr *&ShiftAmount, SMLoc &E);
-  bool ParseDirectiveWord(unsigned Size, SMLoc L);
-  bool ParseDirectiveThumb(SMLoc L);
-  bool ParseDirectiveThumbFunc(SMLoc L);
-  bool ParseDirectiveCode(SMLoc L);
-  bool ParseDirectiveSyntax(SMLoc L);
-
-  bool MatchAndEmitInstruction(SMLoc IDLoc,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               MCStreamer &Out);
-  void GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+  int tryParseRegister();
+  bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
+  int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
+  bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+  bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+                              unsigned &ShiftAmount);
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveThumb(SMLoc L);
+  bool parseDirectiveThumbFunc(SMLoc L);
+  bool parseDirectiveCode(SMLoc L);
+  bool parseDirectiveSyntax(SMLoc L);
+
+  StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+                          bool &CarrySetting, unsigned &ProcessorIMod,
+                          StringRef &ITMask);
+  void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
                              bool &CanAcceptPredicationCode);
 
   bool isThumb() const {
@@ -89,10 +108,22 @@ class ARMAsmParser : public TargetAsmParser {
   bool isThumbOne() const {
     return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) == 0;
   }
+  bool isThumbTwo() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
+  }
+  bool hasV6Ops() const {
+    return STI.getFeatureBits() & ARM::HasV6Ops;
+  }
+  bool hasV7Ops() const {
+    return STI.getFeatureBits() & ARM::HasV7Ops;
+  }
   void SwitchMode() {
     unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
     setAvailableFeatures(FB);
   }
+  bool isMClass() const {
+    return STI.getFeatureBits() & ARM::FeatureMClass;
+  }
 
   /// @name Auto-generated Match Functions
   /// {
@@ -102,43 +133,108 @@ class ARMAsmParser : public TargetAsmParser {
 
   /// }
 
-  OperandMatchResultTy tryParseCoprocNumOperand(
-    SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseCoprocRegOperand(
+  OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseCoprocNumOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseMemBarrierOptOperand(
+  OperandMatchResultTy parseCoprocRegOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseProcIFlagsOperand(
+  OperandMatchResultTy parseCoprocOptionOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseMSRMaskOperand(
+  OperandMatchResultTy parseMemBarrierOptOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseMemMode2Operand(
+  OperandMatchResultTy parseProcIFlagsOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
-  OperandMatchResultTy tryParseMemMode3Operand(
+  OperandMatchResultTy parseMSRMaskOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O,
+                                   StringRef Op, int Low, int High);
+  OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+    return parsePKHImm(O, "lsl", 0, 31);
+  }
+  OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+    return parsePKHImm(O, "asr", 1, 32);
+  }
+  OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&);
 
   // Asm Match Converter Methods
-  bool CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+  bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+  bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+  bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
-  bool CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+  bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
                                   const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtStrdPre(MCInst &Inst, unsigned Opcode,
+                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+                        const SmallVectorImpl<MCParsedAsmOperand*> &);
+
+  bool validateInstruction(MCInst &Inst,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+  void processInstruction(MCInst &Inst,
+                          const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+  bool shouldOmitCCOutOperand(StringRef Mnemonic,
+                              SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
 public:
+  enum ARMMatchResultTy {
+    Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_RequiresNotITBlock,
+    Match_RequiresV6,
+    Match_RequiresThumb2
+  };
+
   ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : TargetAsmParser(), STI(_STI), Parser(_Parser) {
+    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+    // Not in an ITBlock to start with.
+    ITState.CurPosition = ~0U;
   }
 
-  virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
-                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-  virtual bool ParseDirective(AsmToken DirectiveID);
+  // Implementation of the MCTargetAsmParser interface:
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  bool ParseDirective(AsmToken DirectiveID);
+
+  unsigned checkTargetMatchPredicate(MCInst &Inst);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out);
 };
 } // end anonymous namespace
 
@@ -148,22 +244,30 @@ namespace {
 /// instruction.
 class ARMOperand : public MCParsedAsmOperand {
   enum KindTy {
-    CondCode,
-    CCOut,
-    CoprocNum,
-    CoprocReg,
-    Immediate,
-    MemBarrierOpt,
-    Memory,
-    MSRMask,
-    ProcIFlags,
-    Register,
-    RegisterList,
-    DPRRegisterList,
-    SPRRegisterList,
-    ShiftedRegister,
-    Shifter,
-    Token
+    k_CondCode,
+    k_CCOut,
+    k_ITCondMask,
+    k_CoprocNum,
+    k_CoprocReg,
+    k_CoprocOption,
+    k_Immediate,
+    k_FPImmediate,
+    k_MemBarrierOpt,
+    k_Memory,
+    k_PostIndexRegister,
+    k_MSRMask,
+    k_ProcIFlags,
+    k_VectorIndex,
+    k_Register,
+    k_RegisterList,
+    k_DPRRegisterList,
+    k_SPRRegisterList,
+    k_ShiftedRegister,
+    k_ShiftedImmediate,
+    k_ShifterImmediate,
+    k_RotateImmediate,
+    k_BitfieldDescriptor,
+    k_Token
   } Kind;
 
   SMLoc StartLoc, EndLoc;
@@ -175,12 +279,20 @@ class ARMOperand : public MCParsedAsmOperand {
     } CC;
 
     struct {
-      ARM_MB::MemBOpt Val;
-    } MBOpt;
+      unsigned Val;
+    } Cop;
 
     struct {
       unsigned Val;
-    } Cop;
+    } CoprocOption;
+
+    struct {
+      unsigned Mask:4;
+    } ITMask;
+
+    struct {
+      ARM_MB::MemBOpt Val;
+    } MBOpt;
 
     struct {
       ARM_PROC::IFlags Val;
@@ -199,38 +311,61 @@ class ARMOperand : public MCParsedAsmOperand {
       unsigned RegNum;
     } Reg;
 
+    struct {
+      unsigned Val;
+    } VectorIndex;
+
     struct {
       const MCExpr *Val;
     } Imm;
 
+    struct {
+      unsigned Val;       // encoded 8-bit representation
+    } FPImm;
+
     /// Combined record for all forms of ARM address expressions.
     struct {
-      ARMII::AddrMode AddrMode;
       unsigned BaseRegNum;
-      union {
-        unsigned RegNum;     ///< Offset register num, when OffsetIsReg.
-        const MCExpr *Value; ///< Offset value, when !OffsetIsReg.
-      } Offset;
-      const MCExpr *ShiftAmount;     // used when OffsetRegShifted is true
-      enum ARM_AM::ShiftOpc ShiftType; // used when OffsetRegShifted is true
-      unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true
-      unsigned Preindexed       : 1;
-      unsigned Postindexed      : 1;
-      unsigned OffsetIsReg      : 1;
-      unsigned Negative         : 1; // only used when OffsetIsReg is true
-      unsigned Writeback        : 1;
-    } Mem;
+      // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+      // was specified.
+      const MCConstantExpr *OffsetImm;  // Offset immediate value
+      unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
+      ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+      unsigned ShiftImm;        // shift for OffsetReg.
+      unsigned Alignment;       // 0 = no alignment specified
+                                // n = alignment in bytes (8, 16, or 32)
+      unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
+    } Memory;
 
     struct {
+      unsigned RegNum;
+      bool isAdd;
       ARM_AM::ShiftOpc ShiftTy;
+      unsigned ShiftImm;
+    } PostIdxReg;
+
+    struct {
+      bool isASR;
       unsigned Imm;
-    } Shift;
+    } ShifterImm;
     struct {
       ARM_AM::ShiftOpc ShiftTy;
       unsigned SrcReg;
       unsigned ShiftReg;
       unsigned ShiftImm;
-    } ShiftedReg;
+    } RegShiftedReg;
+    struct {
+      ARM_AM::ShiftOpc ShiftTy;
+      unsigned SrcReg;
+      unsigned ShiftImm;
+    } RegShiftedImm;
+    struct {
+      unsigned Imm;
+    } RotImm;
+    struct {
+      unsigned LSB;
+      unsigned Width;
+    } Bitfield;
   };
 
   ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -240,45 +375,69 @@ public:
     StartLoc = o.StartLoc;
     EndLoc = o.EndLoc;
     switch (Kind) {
-    case CondCode:
+    case k_CondCode:
       CC = o.CC;
       break;
-    case Token:
+    case k_ITCondMask:
+      ITMask = o.ITMask;
+      break;
+    case k_Token:
       Tok = o.Tok;
       break;
-    case CCOut:
-    case Register:
+    case k_CCOut:
+    case k_Register:
       Reg = o.Reg;
       break;
-    case RegisterList:
-    case DPRRegisterList:
-    case SPRRegisterList:
+    case k_RegisterList:
+    case k_DPRRegisterList:
+    case k_SPRRegisterList:
       Registers = o.Registers;
       break;
-    case CoprocNum:
-    case CoprocReg:
+    case k_CoprocNum:
+    case k_CoprocReg:
       Cop = o.Cop;
       break;
-    case Immediate:
+    case k_CoprocOption:
+      CoprocOption = o.CoprocOption;
+      break;
+    case k_Immediate:
       Imm = o.Imm;
       break;
-    case MemBarrierOpt:
+    case k_FPImmediate:
+      FPImm = o.FPImm;
+      break;
+    case k_MemBarrierOpt:
       MBOpt = o.MBOpt;
       break;
-    case Memory:
-      Mem = o.Mem;
+    case k_Memory:
+      Memory = o.Memory;
+      break;
+    case k_PostIndexRegister:
+      PostIdxReg = o.PostIdxReg;
       break;
-    case MSRMask:
+    case k_MSRMask:
       MMask = o.MMask;
       break;
-    case ProcIFlags:
+    case k_ProcIFlags:
       IFlags = o.IFlags;
       break;
-    case Shifter:
-      Shift = o.Shift;
+    case k_ShifterImmediate:
+      ShifterImm = o.ShifterImm;
+      break;
+    case k_ShiftedRegister:
+      RegShiftedReg = o.RegShiftedReg;
+      break;
+    case k_ShiftedImmediate:
+      RegShiftedImm = o.RegShiftedImm;
       break;
-    case ShiftedRegister:
-      ShiftedReg = o.ShiftedReg;
+    case k_RotateImmediate:
+      RotImm = o.RotImm;
+      break;
+    case k_BitfieldDescriptor:
+      Bitfield = o.Bitfield;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
       break;
     }
   }
@@ -289,94 +448,96 @@ public:
   SMLoc getEndLoc() const { return EndLoc; }
 
   ARMCC::CondCodes getCondCode() const {
-    assert(Kind == CondCode && "Invalid access!");
+    assert(Kind == k_CondCode && "Invalid access!");
     return CC.Val;
   }
 
   unsigned getCoproc() const {
-    assert((Kind == CoprocNum || Kind == CoprocReg) && "Invalid access!");
+    assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
     return Cop.Val;
   }
 
   StringRef getToken() const {
-    assert(Kind == Token && "Invalid access!");
+    assert(Kind == k_Token && "Invalid access!");
     return StringRef(Tok.Data, Tok.Length);
   }
 
   unsigned getReg() const {
-    assert((Kind == Register || Kind == CCOut) && "Invalid access!");
+    assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
     return Reg.RegNum;
   }
 
   const SmallVectorImpl<unsigned> &getRegList() const {
-    assert((Kind == RegisterList || Kind == DPRRegisterList ||
-            Kind == SPRRegisterList) && "Invalid access!");
+    assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+            Kind == k_SPRRegisterList) && "Invalid access!");
     return Registers;
   }
 
   const MCExpr *getImm() const {
-    assert(Kind == Immediate && "Invalid access!");
+    assert(Kind == k_Immediate && "Invalid access!");
     return Imm.Val;
   }
 
+  unsigned getFPImm() const {
+    assert(Kind == k_FPImmediate && "Invalid access!");
+    return FPImm.Val;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
   ARM_MB::MemBOpt getMemBarrierOpt() const {
-    assert(Kind == MemBarrierOpt && "Invalid access!");
+    assert(Kind == k_MemBarrierOpt && "Invalid access!");
     return MBOpt.Val;
   }
 
   ARM_PROC::IFlags getProcIFlags() const {
-    assert(Kind == ProcIFlags && "Invalid access!");
+    assert(Kind == k_ProcIFlags && "Invalid access!");
     return IFlags.Val;
   }
 
   unsigned getMSRMask() const {
-    assert(Kind == MSRMask && "Invalid access!");
+    assert(Kind == k_MSRMask && "Invalid access!");
     return MMask.Val;
   }
 
-  /// @name Memory Operand Accessors
-  /// @{
-  ARMII::AddrMode getMemAddrMode() const {
-    return Mem.AddrMode;
-  }
-  unsigned getMemBaseRegNum() const {
-    return Mem.BaseRegNum;
-  }
-  unsigned getMemOffsetRegNum() const {
-    assert(Mem.OffsetIsReg && "Invalid access!");
-    return Mem.Offset.RegNum;
-  }
-  const MCExpr *getMemOffset() const {
-    assert(!Mem.OffsetIsReg && "Invalid access!");
-    return Mem.Offset.Value;
-  }
-  unsigned getMemOffsetRegShifted() const {
-    assert(Mem.OffsetIsReg && "Invalid access!");
-    return Mem.OffsetRegShifted;
+  bool isCoprocNum() const { return Kind == k_CoprocNum; }
+  bool isCoprocReg() const { return Kind == k_CoprocReg; }
+  bool isCoprocOption() const { return Kind == k_CoprocOption; }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isCCOut() const { return Kind == k_CCOut; }
+  bool isITMask() const { return Kind == k_ITCondMask; }
+  bool isITCondCode() const { return Kind == k_CondCode; }
+  bool isImm() const { return Kind == k_Immediate; }
+  bool isFPImm() const { return Kind == k_FPImmediate; }
+  bool isImm8s4() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
   }
-  const MCExpr *getMemShiftAmount() const {
-    assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
-    return Mem.ShiftAmount;
+  bool isImm0_1020s4() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
   }
-  enum ARM_AM::ShiftOpc getMemShiftType() const {
-    assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
-    return Mem.ShiftType;
+  bool isImm0_508s4() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
   }
-  bool getMemPreindexed() const { return Mem.Preindexed; }
-  bool getMemPostindexed() const { return Mem.Postindexed; }
-  bool getMemOffsetIsReg() const { return Mem.OffsetIsReg; }
-  bool getMemNegative() const { return Mem.Negative; }
-  bool getMemWriteback() const { return Mem.Writeback; }
-
-  /// @}
-
-  bool isCoprocNum() const { return Kind == CoprocNum; }
-  bool isCoprocReg() const { return Kind == CoprocReg; }
-  bool isCondCode() const { return Kind == CondCode; }
-  bool isCCOut() const { return Kind == CCOut; }
-  bool isImm() const { return Kind == Immediate; }
   bool isImm0_255() const {
-    if (Kind != Immediate)
+    if (Kind != k_Immediate)
       return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
@@ -384,7 +545,7 @@ public:
     return Value >= 0 && Value < 256;
   }
   bool isImm0_7() const {
-    if (Kind != Immediate)
+    if (Kind != k_Immediate)
       return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
@@ -392,130 +553,365 @@ public:
     return Value >= 0 && Value < 8;
   }
   bool isImm0_15() const {
-    if (Kind != Immediate)
+    if (Kind != k_Immediate)
       return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
     return Value >= 0 && Value < 16;
   }
+  bool isImm0_31() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isImm1_16() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 17;
+  }
+  bool isImm1_32() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
   bool isImm0_65535() const {
-    if (Kind != Immediate)
+    if (Kind != k_Immediate)
       return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
     return Value >= 0 && Value < 65536;
   }
-  bool isT2SOImm() const {
-    if (Kind != Immediate)
+  bool isImm0_65535Expr() const {
+    if (Kind != k_Immediate)
       return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
     int64_t Value = CE->getValue();
-    return ARM_AM::getT2SOImmVal(Value) != -1;
+    return Value >= 0 && Value < 65536;
   }
-  bool isReg() const { return Kind == Register; }
-  bool isRegList() const { return Kind == RegisterList; }
-  bool isDPRRegList() const { return Kind == DPRRegisterList; }
-  bool isSPRRegList() const { return Kind == SPRRegisterList; }
-  bool isToken() const { return Kind == Token; }
-  bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; }
-  bool isMemory() const { return Kind == Memory; }
-  bool isShifter() const { return Kind == Shifter; }
-  bool isShiftedReg() const { return Kind == ShiftedRegister; }
-  bool isMemMode2() const {
-    if (getMemAddrMode() != ARMII::AddrMode2)
+  bool isImm24bit() const {
+    if (Kind != k_Immediate)
       return false;
-
-    if (getMemOffsetIsReg())
-      return true;
-
-    if (getMemNegative() &&
-        !(getMemPostindexed() || getMemPreindexed()))
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 0xffffff;
+  }
+  bool isImmThumbSR() const {
+    if (Kind != k_Immediate)
       return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-
-    // The offset must be in the range 0-4095 (imm12).
-    if (Value > 4095 || Value < -4095)
+    return Value > 0 && Value < 33;
+  }
+  bool isPKHLSLImm() const {
+    if (Kind != k_Immediate)
       return false;
-
-    return true;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
   }
-  bool isMemMode3() const {
-    if (getMemAddrMode() != ARMII::AddrMode3)
+  bool isPKHASRImm() const {
+    if (Kind != k_Immediate)
       return false;
-
-    if (getMemOffsetIsReg()) {
-      if (getMemOffsetRegShifted())
-        return false; // No shift with offset reg allowed
-      return true;
-    }
-
-    if (getMemNegative() &&
-        !(getMemPostindexed() || getMemPreindexed()))
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isARMSOImm() const {
+    if (Kind != k_Immediate)
       return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-
-    // The offset must be in the range 0-255 (imm8).
-    if (Value > 255 || Value < -255)
+    return ARM_AM::getSOImmVal(Value) != -1;
+  }
+  bool isT2SOImm() const {
+    if (Kind != k_Immediate)
       return false;
-
-    return true;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) != -1;
   }
-  bool isMemMode5() const {
-    if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() ||
-        getMemNegative())
+  bool isSetEndImm() const {
+    if (Kind != k_Immediate)
       return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
-
-    // The offset must be a multiple of 4 in the range 0-1020.
     int64_t Value = CE->getValue();
-    return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020);
-  }
-  bool isMemMode7() const {
-    if (!isMemory() ||
-        getMemPreindexed() ||
-        getMemPostindexed() ||
-        getMemOffsetIsReg() ||
-        getMemNegative() ||
-        getMemWriteback())
+    return Value == 1 || Value == 0;
+  }
+  bool isReg() const { return Kind == k_Register; }
+  bool isRegList() const { return Kind == k_RegisterList; }
+  bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+  bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+  bool isToken() const { return Kind == k_Token; }
+  bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+  bool isMemory() const { return Kind == k_Memory; }
+  bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+  bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+  bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+  bool isRotImm() const { return Kind == k_RotateImmediate; }
+  bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+  bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+  bool isPostIdxReg() const {
+    return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy == ARM_AM::no_shift;
+  }
+  bool isMemNoOffset(bool alignOK = false) const {
+    if (!isMemory())
       return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    // No offset of any kind.
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
+     (alignOK || Memory.Alignment == 0);
+  }
+  bool isAlignedMemory() const {
+    return isMemNoOffset(true);
+  }
+  bool isAddrMode2() const {
+    if (!isMemory() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAM2OffsetImm() const {
+    if (Kind != k_Immediate)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
-
-    if (CE->getValue())
+    int64_t Val = CE->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAddrMode3() const {
+    if (!isMemory() || Memory.Alignment != 0) return false;
+    // No shifts are legal for AM3.
+    if (Memory.ShiftType != ARM_AM::no_shift) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -256 && Val < 256;
+  }
+  bool isAM3Offset() const {
+    if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+      return false;
+    if (Kind == k_PostIndexRegister)
+      return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+    // Immediate offset in range [-255, 255].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAddrMode5() const {
+    if (!isMemory() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-1020, 1020] and a multiple of 4.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+           Val == INT32_MIN;
+  }
+  bool isMemTBB() const {
+    if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemTBH() const {
+    if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+        Memory.Alignment != 0 )
+      return false;
+    return true;
+  }
+  bool isMemRegOffset() const {
+    if (!isMemory() || !Memory.OffsetRegNum || Memory.Alignment != 0)
       return false;
-
     return true;
   }
-  bool isMemModeRegThumb() const {
-    if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback())
+  bool isT2MemRegOffset() const {
+    if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.Alignment != 0)
+      return false;
+    // Only lsl #{0, 1, 2, 3} allowed.
+    if (Memory.ShiftType == ARM_AM::no_shift)
+      return true;
+    if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
       return false;
     return true;
   }
-  bool isMemModeImmThumb() const {
-    if (!isMemory() || getMemOffsetIsReg() || getMemWriteback())
+  bool isMemThumbRR() const {
+    // Thumb reg+reg addressing is simple. Just two registers, a base and
+    // an offset. No shifts, negations or any other complicating factors.
+    if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return isARMLowRegister(Memory.BaseRegNum) &&
+      (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+  }
+  bool isMemThumbRIs4() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 124].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+  }
+  bool isMemThumbRIs2() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 62].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+  }
+  bool isMemThumbRIs1() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 31].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 31;
+  }
+  bool isMemThumbSPI() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 ||
+        Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+  }
+  bool isMemImm8s4Offset() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
       return false;
+    // Immediate offset a multiple of 4 in range [-1020, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= -1020 && Val <= 1020 && (Val & 3) == 0;
+  }
+  bool isMemImm0_1020s4Offset() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+  }
+  bool isMemImm8Offset() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 256);
+  }
+  bool isMemPosImm8Offset() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val < 256;
+  }
+  bool isMemNegImm8Offset() const {
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-255, -1].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -256 && Val < 0;
+  }
+  bool isMemUImm12Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= 0 && Val < 4096);
+  }
+  bool isMemImm12Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm()))
+      return true;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val > -256 && Val < 256) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8s4() const {
+    if (Kind != k_Immediate)
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) ||
+      (Val == INT32_MIN);
+  }
+
+  bool isMSRMask() const { return Kind == k_MSRMask; }
+  bool isProcIFlags() const { return Kind == k_ProcIFlags; }
 
-    // The offset must be a multiple of 4 in the range 0-124.
-    uint64_t Value = CE->getValue();
-    return ((Value & 0x3) == 0 && Value <= 124);
+  bool isVectorIndex8() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 8;
   }
-  bool isMSRMask() const { return Kind == MSRMask; }
-  bool isProcIFlags() const { return Kind == ProcIFlags; }
+  bool isVectorIndex16() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 4;
+  }
+  bool isVectorIndex32() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 2;
+  }
+
+
 
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.  Null MCExpr = 0.
@@ -544,6 +940,21 @@ public:
     Inst.addOperand(MCOperand::CreateImm(getCoproc()));
   }
 
+  void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(CoprocOption.Val));
+  }
+
+  void addITMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(ITMask.Mask));
+  }
+
+  void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+  }
+
   void addCCOutOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateReg(getReg()));
@@ -554,22 +965,27 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  void addShiftedRegOperands(MCInst &Inst, unsigned N) const {
+  void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 3 && "Invalid number of operands!");
-    assert(isShiftedReg() && "addShiftedRegOperands() on non ShiftedReg!");
-    assert((ShiftedReg.ShiftReg == 0 ||
-            ARM_AM::getSORegOffset(ShiftedReg.ShiftImm) == 0) &&
-           "Invalid shifted register operand!");
-    Inst.addOperand(MCOperand::CreateReg(ShiftedReg.SrcReg));
-    Inst.addOperand(MCOperand::CreateReg(ShiftedReg.ShiftReg));
+    assert(isRegShiftedReg() && "addRegShiftedRegOperands() on non RegShiftedReg!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg));
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg));
     Inst.addOperand(MCOperand::CreateImm(
-      ARM_AM::getSORegOpc(ShiftedReg.ShiftTy, ShiftedReg.ShiftImm)));
+      ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm)));
   }
 
-  void addShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    assert(isRegShiftedImm() && "addRegShiftedImmOperands() on non RegShiftedImm!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
     Inst.addOperand(MCOperand::CreateImm(
-      ARM_AM::getSORegOpc(Shift.ShiftTy, 0)));
+      ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm)));
+  }
+
+  void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm((ShifterImm.isASR << 5) |
+                                         ShifterImm.Imm));
   }
 
   void addRegListOperands(MCInst &Inst, unsigned N) const {
@@ -588,11 +1004,57 @@ public:
     addRegListOperands(Inst, N);
   }
 
+  void addRotImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Encoded as val>>3. The printer handles display as 8, 16, 24.
+    Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3));
+  }
+
+  void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Munge the lsb/width into a bitfield mask.
+    unsigned lsb = Bitfield.LSB;
+    unsigned width = Bitfield.Width;
+    // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+    uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+                      (32 - (lsb + width)));
+    Inst.addOperand(MCOperand::CreateImm(Mask));
+  }
+
   void addImmOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
   }
 
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+  }
+
+  void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // FIXME: We really want to scale the value here, but the LDRD/STRD
+    // instruction don't encode operands that way yet.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  }
+
+  void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
+  void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
   void addImm0_255Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
@@ -608,137 +1070,344 @@ public:
     addExpr(Inst, getImm());
   }
 
-  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
   }
 
-  void addT2SOImmOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getImm());
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
   }
 
-  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
   }
 
-  void addMemMode7Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && isMemMode7() && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
-    (void)CE;
-    assert((CE || CE->getValue() == 0) &&
-           "No offset operand support in mode 7");
+  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
   }
 
-  void addMemMode2Operands(MCInst &Inst, unsigned N) const {
-    assert(isMemMode2() && "Invalid mode or number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-    unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1);
-
-    if (getMemOffsetIsReg()) {
-      Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+  void addImm0_65535ExprOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
 
-      ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add;
-      ARM_AM::ShiftOpc ShOpc = ARM_AM::no_shift;
-      int64_t ShiftAmount = 0;
+  void addImm24bitOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
 
-      if (getMemOffsetRegShifted()) {
-        ShOpc = getMemShiftType();
-        const MCConstantExpr *CE =
-                   dyn_cast<MCConstantExpr>(getMemShiftAmount());
-        ShiftAmount = CE->getValue();
-      }
+  void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate, except for 32, which encodes as
+    // zero.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Imm = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((Imm == 32 ? 0 : Imm)));
+  }
+
+  void addPKHLSLImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // An ASR value of 32 encodes as 0, so that's how we want to add it to
+    // the instruction as well.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val == 32 ? 0 : Val));
+  }
+
+  void addARMSOImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addT2SOImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addSetEndImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
+  }
+
+  void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+  }
+
+  void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.Alignment));
+  }
+
+  void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                              Memory.ShiftImm, Memory.ShiftType);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant AM2OffsetImm operand!");
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM3Opc(AddSub, Val);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
 
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(AMOpc, ShiftAmount,
-                                           ShOpc, IdxMode)));
+  void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (Kind == k_PostIndexRegister) {
+      int32_t Val =
+        ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+      Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+      Inst.addOperand(MCOperand::CreateImm(Val));
       return;
     }
 
-    // Create a operand placeholder to always yield the same number of operands.
+    // Constant offset.
+    const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm());
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM3Opc(AddSub, Val);
     Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
 
-    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
-    // the difference?
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
-    assert(CE && "Non-constant mode 2 offset operand!");
-    int64_t Offset = CE->getValue();
+  void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM5Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
 
-    if (Offset >= 0)
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::add,
-                                           Offset, ARM_AM::no_shift, IdxMode)));
-    else
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::sub,
-                                          -Offset, ARM_AM::no_shift, IdxMode)));
+  void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
   }
 
-  void addMemMode3Operands(MCInst &Inst, unsigned N) const {
-    assert(isMemMode3() && "Invalid mode or number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-    unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1);
+  void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
 
-    if (getMemOffsetIsReg()) {
-      Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+  void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
 
-      ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add;
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(AMOpc, 0,
-                                                             IdxMode)));
+  void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (Kind == k_Immediate) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
       return;
     }
 
-    // Create a operand placeholder to always yield the same number of operands.
-    Inst.addOperand(MCOperand::CreateReg(0));
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
 
-    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
-    // the difference?
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
-    assert(CE && "Non-constant mode 3 offset operand!");
-    int64_t Offset = CE->getValue();
+  void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (Kind == k_Immediate) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
 
-    if (Offset >= 0)
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::add,
-                                           Offset, IdxMode)));
-    else
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::sub,
-                                           -Offset, IdxMode)));
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
   }
 
-  void addMemMode5Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemMode5() && "Invalid number of operands!");
+  void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
 
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-    assert(!getMemOffsetIsReg() && "Invalid mode 5 operand");
+  void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    unsigned Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                                     Memory.ShiftImm, Memory.ShiftType);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
 
-    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
-    // the difference?
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
-    assert(CE && "Non-constant mode 5 offset operand!");
+  void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.ShiftImm));
+  }
 
-    // The MCInst offset operand doesn't include the low two bits (like
-    // the instruction encoding).
-    int64_t Offset = CE->getValue() / 4;
-    if (Offset >= 0)
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add,
-                                                             Offset)));
-    else
-      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub,
-                                                             -Offset)));
+  void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
   }
 
-  void addMemModeRegThumbOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemModeRegThumb() && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-    Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+  void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
   }
 
-  void addMemModeImmThumbOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemModeImmThumb() && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
-    assert(CE && "Non-constant mode offset operand!");
-    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8s4 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    // Immediate is scaled by 4.
+    Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    Inst.addOperand(MCOperand::CreateImm(PostIdxReg.isAdd));
+  }
+
+  void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    // The sign, shift type, and shift amount are encoded in a single operand
+    // using the AM2 encoding helpers.
+    ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+    unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm,
+                                     PostIdxReg.ShiftTy);
+    Inst.addOperand(MCOperand::CreateImm(Imm));
   }
 
   void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
@@ -751,10 +1420,33 @@ public:
     Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
   }
 
+  void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
   virtual void print(raw_ostream &OS) const;
 
+  static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_ITCondMask);
+    Op->ITMask.Mask = Mask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
   static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(CondCode);
+    ARMOperand *Op = new ARMOperand(k_CondCode);
     Op->CC.Val = CC;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -762,7 +1454,7 @@ public:
   }
 
   static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(CoprocNum);
+    ARMOperand *Op = new ARMOperand(k_CoprocNum);
     Op->Cop.Val = CopVal;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -770,15 +1462,23 @@ public:
   }
 
   static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(CoprocReg);
+    ARMOperand *Op = new ARMOperand(k_CoprocReg);
     Op->Cop.Val = CopVal;
     Op->StartLoc = S;
     Op->EndLoc = S;
     return Op;
   }
 
+  static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_CoprocOption);
+    Op->Cop.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(CCOut);
+    ARMOperand *Op = new ARMOperand(k_CCOut);
     Op->Reg.RegNum = RegNum;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -786,7 +1486,7 @@ public:
   }
 
   static ARMOperand *CreateToken(StringRef Str, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(Token);
+    ARMOperand *Op = new ARMOperand(k_Token);
     Op->Tok.Data = Str.data();
     Op->Tok.Length = Str.size();
     Op->StartLoc = S;
@@ -795,7 +1495,7 @@ public:
   }
 
   static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
-    ARMOperand *Op = new ARMOperand(Register);
+    ARMOperand *Op = new ARMOperand(k_Register);
     Op->Reg.RegNum = RegNum;
     Op->StartLoc = S;
     Op->EndLoc = E;
@@ -807,20 +1507,52 @@ public:
                                            unsigned ShiftReg,
                                            unsigned ShiftImm,
                                            SMLoc S, SMLoc E) {
-    ARMOperand *Op = new ARMOperand(ShiftedRegister);
-    Op->ShiftedReg.ShiftTy = ShTy;
-    Op->ShiftedReg.SrcReg = SrcReg;
-    Op->ShiftedReg.ShiftReg = ShiftReg;
-    Op->ShiftedReg.ShiftImm = ShiftImm;
+    ARMOperand *Op = new ARMOperand(k_ShiftedRegister);
+    Op->RegShiftedReg.ShiftTy = ShTy;
+    Op->RegShiftedReg.SrcReg = SrcReg;
+    Op->RegShiftedReg.ShiftReg = ShiftReg;
+    Op->RegShiftedReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy,
+                                            unsigned SrcReg,
+                                            unsigned ShiftImm,
+                                            SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_ShiftedImmediate);
+    Op->RegShiftedImm.ShiftTy = ShTy;
+    Op->RegShiftedImm.SrcReg = SrcReg;
+    Op->RegShiftedImm.ShiftImm = ShiftImm;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
   }
 
-  static ARMOperand *CreateShifter(ARM_AM::ShiftOpc ShTy,
+  static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm,
                                    SMLoc S, SMLoc E) {
-    ARMOperand *Op = new ARMOperand(Shifter);
-    Op->Shift.ShiftTy = ShTy;
+    ARMOperand *Op = new ARMOperand(k_ShifterImmediate);
+    Op->ShifterImm.isASR = isASR;
+    Op->ShifterImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_RotateImmediate);
+    Op->RotImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width,
+                                    SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor);
+    Op->Bitfield.LSB = LSB;
+    Op->Bitfield.Width = Width;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
@@ -829,12 +1561,13 @@ public:
   static ARMOperand *
   CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
                 SMLoc StartLoc, SMLoc EndLoc) {
-    KindTy Kind = RegisterList;
+    KindTy Kind = k_RegisterList;
 
-    if (ARM::DPRRegClass.contains(Regs.front().first))
-      Kind = DPRRegisterList;
-    else if (ARM::SPRRegClass.contains(Regs.front().first))
-      Kind = SPRRegisterList;
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first))
+      Kind = k_DPRRegisterList;
+    else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
+             contains(Regs.front().first))
+      Kind = k_SPRRegisterList;
 
     ARMOperand *Op = new ARMOperand(Kind);
     for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
@@ -846,55 +1579,68 @@ public:
     return Op;
   }
 
+  static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+                                       MCContext &Ctx) {
+    ARMOperand *Op = new ARMOperand(k_VectorIndex);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
-    ARMOperand *Op = new ARMOperand(Immediate);
+    ARMOperand *Op = new ARMOperand(k_Immediate);
     Op->Imm.Val = Val;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
   }
 
-  static ARMOperand *CreateMem(ARMII::AddrMode AddrMode, unsigned BaseRegNum,
-                               bool OffsetIsReg, const MCExpr *Offset,
-                               int OffsetRegNum, bool OffsetRegShifted,
-                               enum ARM_AM::ShiftOpc ShiftType,
-                               const MCExpr *ShiftAmount, bool Preindexed,
-                               bool Postindexed, bool Negative, bool Writeback,
+  static ARMOperand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
+    ARMOperand *Op = new ARMOperand(k_FPImmediate);
+    Op->FPImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateMem(unsigned BaseRegNum,
+                               const MCConstantExpr *OffsetImm,
+                               unsigned OffsetRegNum,
+                               ARM_AM::ShiftOpc ShiftType,
+                               unsigned ShiftImm,
+                               unsigned Alignment,
+                               bool isNegative,
                                SMLoc S, SMLoc E) {
-    assert((OffsetRegNum == -1 || OffsetIsReg) &&
-           "OffsetRegNum must imply OffsetIsReg!");
-    assert((!OffsetRegShifted || OffsetIsReg) &&
-           "OffsetRegShifted must imply OffsetIsReg!");
-    assert((Offset || OffsetIsReg) &&
-           "Offset must exists unless register offset is used!");
-    assert((!ShiftAmount || (OffsetIsReg && OffsetRegShifted)) &&
-           "Cannot have shift amount without shifted register offset!");
-    assert((!Offset || !OffsetIsReg) &&
-           "Cannot have expression offset and register offset!");
-
-    ARMOperand *Op = new ARMOperand(Memory);
-    Op->Mem.AddrMode = AddrMode;
-    Op->Mem.BaseRegNum = BaseRegNum;
-    Op->Mem.OffsetIsReg = OffsetIsReg;
-    if (OffsetIsReg)
-      Op->Mem.Offset.RegNum = OffsetRegNum;
-    else
-      Op->Mem.Offset.Value = Offset;
-    Op->Mem.OffsetRegShifted = OffsetRegShifted;
-    Op->Mem.ShiftType = ShiftType;
-    Op->Mem.ShiftAmount = ShiftAmount;
-    Op->Mem.Preindexed = Preindexed;
-    Op->Mem.Postindexed = Postindexed;
-    Op->Mem.Negative = Negative;
-    Op->Mem.Writeback = Writeback;
+    ARMOperand *Op = new ARMOperand(k_Memory);
+    Op->Memory.BaseRegNum = BaseRegNum;
+    Op->Memory.OffsetImm = OffsetImm;
+    Op->Memory.OffsetRegNum = OffsetRegNum;
+    Op->Memory.ShiftType = ShiftType;
+    Op->Memory.ShiftImm = ShiftImm;
+    Op->Memory.Alignment = Alignment;
+    Op->Memory.isNegative = isNegative;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
+  static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd,
+                                      ARM_AM::ShiftOpc ShiftTy,
+                                      unsigned ShiftImm,
+                                      SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_PostIndexRegister);
+    Op->PostIdxReg.RegNum = RegNum;
+    Op->PostIdxReg.isAdd = isAdd;
+    Op->PostIdxReg.ShiftTy = ShiftTy;
+    Op->PostIdxReg.ShiftImm = ShiftImm;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
   }
 
   static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(MemBarrierOpt);
+    ARMOperand *Op = new ARMOperand(k_MemBarrierOpt);
     Op->MBOpt.Val = Opt;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -902,7 +1648,7 @@ public:
   }
 
   static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(ProcIFlags);
+    ARMOperand *Op = new ARMOperand(k_ProcIFlags);
     Op->IFlags.Val = IFlags;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -910,7 +1656,7 @@ public:
   }
 
   static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) {
-    ARMOperand *Op = new ARMOperand(MSRMask);
+    ARMOperand *Op = new ARMOperand(k_MSRMask);
     Op->MMask.Val = MMask;
     Op->StartLoc = S;
     Op->EndLoc = S;
@@ -922,53 +1668,56 @@ public:
 
 void ARMOperand::print(raw_ostream &OS) const {
   switch (Kind) {
-  case CondCode:
+  case k_FPImmediate:
+    OS << "<fpimm " << getFPImm() << "(" << ARM_AM::getFPImmFloat(getFPImm())
+       << ") >";
+    break;
+  case k_CondCode:
     OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
     break;
-  case CCOut:
+  case k_CCOut:
     OS << "<ccout " << getReg() << ">";
     break;
-  case CoprocNum:
+  case k_ITCondMask: {
+    static char MaskStr[][6] = { "()", "(t)", "(e)", "(tt)", "(et)", "(te)",
+      "(ee)", "(ttt)", "(ett)", "(tet)", "(eet)", "(tte)", "(ete)",
+      "(tee)", "(eee)" };
+    assert((ITMask.Mask & 0xf) == ITMask.Mask);
+    OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
+    break;
+  }
+  case k_CoprocNum:
     OS << "<coprocessor number: " << getCoproc() << ">";
     break;
-  case CoprocReg:
+  case k_CoprocReg:
     OS << "<coprocessor register: " << getCoproc() << ">";
     break;
-  case MSRMask:
+  case k_CoprocOption:
+    OS << "<coprocessor option: " << CoprocOption.Val << ">";
+    break;
+  case k_MSRMask:
     OS << "<mask: " << getMSRMask() << ">";
     break;
-  case Immediate:
+  case k_Immediate:
     getImm()->print(OS);
     break;
-  case MemBarrierOpt:
+  case k_MemBarrierOpt:
     OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
     break;
-  case Memory:
+  case k_Memory:
     OS << "<memory "
-       << "am:" << ARMII::AddrModeToString(getMemAddrMode())
-       << " base:" << getMemBaseRegNum();
-    if (getMemOffsetIsReg()) {
-      OS << " offset:<register " << getMemOffsetRegNum();
-      if (getMemOffsetRegShifted()) {
-        OS << " offset-shift-type:" << getMemShiftType();
-        OS << " offset-shift-amount:" << *getMemShiftAmount();
-      }
-    } else {
-      OS << " offset:" << *getMemOffset();
-    }
-    if (getMemOffsetIsReg())
-      OS << " (offset-is-reg)";
-    if (getMemPreindexed())
-      OS << " (pre-indexed)";
-    if (getMemPostindexed())
-      OS << " (post-indexed)";
-    if (getMemNegative())
-      OS << " (negative)";
-    if (getMemWriteback())
-      OS << " (writeback)";
+       << " base:" << Memory.BaseRegNum;
+    OS << ">";
+    break;
+  case k_PostIndexRegister:
+    OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+       << PostIdxReg.RegNum;
+    if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+      OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+         << PostIdxReg.ShiftImm;
     OS << ">";
     break;
-  case ProcIFlags: {
+  case k_ProcIFlags: {
     OS << "<ARM_PROC::";
     unsigned IFlags = getProcIFlags();
     for (int i=2; i >= 0; --i)
@@ -977,23 +1726,38 @@ void ARMOperand::print(raw_ostream &OS) const {
     OS << ">";
     break;
   }
-  case Register:
+  case k_Register:
     OS << "<register " << getReg() << ">";
     break;
-  case Shifter:
-    OS << "<shifter " << ARM_AM::getShiftOpcStr(Shift.ShiftTy) << ">";
+  case k_ShifterImmediate:
+    OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+       << " #" << ShifterImm.Imm << ">";
+    break;
+  case k_ShiftedRegister:
+    OS << "<so_reg_reg "
+       << RegShiftedReg.SrcReg
+       << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedReg.ShiftImm))
+       << ", " << RegShiftedReg.ShiftReg << ", "
+       << ARM_AM::getSORegOffset(RegShiftedReg.ShiftImm)
+       << ">";
     break;
-  case ShiftedRegister:
-    OS << "<so_reg"
-       << ShiftedReg.SrcReg
-       << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(ShiftedReg.ShiftImm))
-       << ", " << ShiftedReg.ShiftReg << ", "
-       << ARM_AM::getSORegOffset(ShiftedReg.ShiftImm)
+  case k_ShiftedImmediate:
+    OS << "<so_reg_imm "
+       << RegShiftedImm.SrcReg
+       << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedImm.ShiftImm))
+       << ", " << ARM_AM::getSORegOffset(RegShiftedImm.ShiftImm)
        << ">";
     break;
-  case RegisterList:
-  case DPRRegisterList:
-  case SPRRegisterList: {
+  case k_RotateImmediate:
+    OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+    break;
+  case k_BitfieldDescriptor:
+    OS << "<bitfield " << "lsb: " << Bitfield.LSB
+       << ", width: " << Bitfield.Width << ">";
+    break;
+  case k_RegisterList:
+  case k_DPRRegisterList:
+  case k_SPRRegisterList: {
     OS << "<register_list ";
 
     const SmallVectorImpl<unsigned> &RegList = getRegList();
@@ -1006,9 +1770,12 @@ void ARMOperand::print(raw_ostream &OS) const {
     OS << ">";
     break;
   }
-  case Token:
+  case k_Token:
     OS << "'" << getToken() << "'";
     break;
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
   }
 }
 
@@ -1021,7 +1788,7 @@ static unsigned MatchRegisterName(StringRef Name);
 
 bool ARMAsmParser::ParseRegister(unsigned &RegNo,
                                  SMLoc &StartLoc, SMLoc &EndLoc) {
-  RegNo = TryParseRegister();
+  RegNo = tryParseRegister();
 
   return (RegNo == (unsigned)-1);
 }
@@ -1030,9 +1797,9 @@ bool ARMAsmParser::ParseRegister(unsigned &RegNo,
 /// and if it is a register name the token is eaten and the register number is
 /// returned.  Otherwise return -1.
 ///
-int ARMAsmParser::TryParseRegister() {
+int ARMAsmParser::tryParseRegister() {
   const AsmToken &Tok = Parser.getTok();
-  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  if (Tok.isNot(AsmToken::Identifier)) return -1;
 
   // FIXME: Validate register for the current architecture; we have to do
   // validation later, so maybe there is no need for this here.
@@ -1050,6 +1817,39 @@ int ARMAsmParser::TryParseRegister() {
   if (!RegNum) return -1;
 
   Parser.Lex(); // Eat identifier token.
+
+#if 0
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = Parser.getTok().getLoc();
+    if (getParser().ParseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return MatchOperand_ParseFail;
+    }
+
+    SMLoc E = Parser.getTok().getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+#endif
+
   return RegNum;
 }
 
@@ -1058,7 +1858,7 @@ int ARMAsmParser::TryParseRegister() {
 // occurs, return -1. An irrecoverable error is one where tokens have been
 // consumed in the process of trying to parse the shifter (i.e., when it is
 // indeed a shifter operand, but malformed).
-int ARMAsmParser::TryParseShiftRegister(
+int ARMAsmParser::tryParseShiftRegister(
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
@@ -1120,7 +1920,7 @@ int ARMAsmParser::TryParseShiftRegister(
         return -1;
       }
     } else if (Parser.getTok().is(AsmToken::Identifier)) {
-      ShiftReg = TryParseRegister();
+      ShiftReg = tryParseRegister();
       SMLoc L = Parser.getTok().getLoc();
       if (ShiftReg == -1) {
         Error (L, "expected immediate or register in shift operand");
@@ -1133,8 +1933,12 @@ int ARMAsmParser::TryParseShiftRegister(
     }
   }
 
-  Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
-                                                       ShiftReg, Imm,
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                               S, Parser.getTok().getLoc()));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
                                                S, Parser.getTok().getLoc()));
 
   return 0;
@@ -1148,9 +1952,9 @@ int ARMAsmParser::TryParseShiftRegister(
 /// TODO this is likely to change to allow different register types and or to
 /// parse for a specific register type.
 bool ARMAsmParser::
-TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
-  int RegNo = TryParseRegister();
+  int RegNo = tryParseRegister();
   if (RegNo == -1)
     return true;
 
@@ -1161,6 +1965,37 @@ TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
                                                ExclaimTok.getLoc()));
     Parser.Lex(); // Eat exclaim token
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = Parser.getTok().getLoc();
+    if (getParser().ParseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return MatchOperand_ParseFail;
+    }
+
+    SMLoc E = Parser.getTok().getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
   }
 
   return false;
@@ -1209,14 +2044,50 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
   return -1;
 }
 
-/// tryParseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  unsigned CC = StringSwitch<unsigned>(Tok.getString())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(~0U);
+  if (CC == ~0U)
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the token.
+
+  Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The
 /// token must be an Identifier when called, and if it is a coprocessor
 /// number, the token is eaten and the operand is added to the operand list.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
-  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
 
   int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
   if (Num == -1)
@@ -1227,14 +2098,15 @@ tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// tryParseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The
 /// token must be an Identifier when called, and if it is a coprocessor
 /// number, the token is eaten and the operand is added to the operand list.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
-  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
 
   int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
   if (Reg == -1)
@@ -1245,93 +2117,155 @@ tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// Parse a register list, return it if successful else return null.  The first
-/// token must be a '{' when called.
-bool ARMAsmParser::
-ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  assert(Parser.getTok().is(AsmToken::LCurly) &&
-         "Token is not a Left Curly Brace");
+/// parseCoprocOptionOperand - Try to parse an coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
 
-  // Read the rest of the registers in the list.
-  unsigned PrevRegNum = 0;
-  SmallVector<std::pair<unsigned, SMLoc>, 32> Registers;
-
-  do {
-    bool IsRange = Parser.getTok().is(AsmToken::Minus);
-    Parser.Lex(); // Eat non-identifier token.
-
-    const AsmToken &RegTok = Parser.getTok();
-    SMLoc RegLoc = RegTok.getLoc();
-    if (RegTok.isNot(AsmToken::Identifier)) {
-      Error(RegLoc, "register expected");
-      return true;
-    }
-
-    int RegNum = TryParseRegister();
-    if (RegNum == -1) {
-      Error(RegLoc, "register expected");
-      return true;
-    }
-
-    if (IsRange) {
-      int Reg = PrevRegNum;
-      do {
-        ++Reg;
-        Registers.push_back(std::make_pair(Reg, RegLoc));
-      } while (Reg != RegNum);
-    } else {
-      Registers.push_back(std::make_pair(RegNum, RegLoc));
-    }
-
-    PrevRegNum = RegNum;
-  } while (Parser.getTok().is(AsmToken::Comma) ||
-           Parser.getTok().is(AsmToken::Minus));
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'
 
-  // Process the right curly brace of the list.
-  const AsmToken &RCurlyTok = Parser.getTok();
-  if (RCurlyTok.isNot(AsmToken::RCurly)) {
-    Error(RCurlyTok.getLoc(), "'}' expected");
-    return true;
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().ParseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
   }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
 
-  SMLoc E = RCurlyTok.getLoc();
-  Parser.Lex(); // Eat right curly brace token.
-
-  // Verify the register list.
-  SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
-    RI = Registers.begin(), RE = Registers.end();
+  // Check for and consume the closing '}'
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return MatchOperand_ParseFail;
+  SMLoc E = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat the '}'
 
-  unsigned HighRegNum = getARMRegisterNumbering(RI->first);
-  bool EmittedWarning = false;
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
 
-  DenseMap<unsigned, bool> RegMap;
-  RegMap[HighRegNum] = true;
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+  // If this is a GPR, we need to do it manually, otherwise we can rely
+  // on the sort ordering of the enumeration since the other reg-classes
+  // are sane.
+  if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    return Reg + 1;
+  switch(Reg) {
+  default: assert(0 && "Invalid GPR number!");
+  case ARM::R0:  return ARM::R1;  case ARM::R1:  return ARM::R2;
+  case ARM::R2:  return ARM::R3;  case ARM::R3:  return ARM::R4;
+  case ARM::R4:  return ARM::R5;  case ARM::R5:  return ARM::R6;
+  case ARM::R6:  return ARM::R7;  case ARM::R7:  return ARM::R8;
+  case ARM::R8:  return ARM::R9;  case ARM::R9:  return ARM::R10;
+  case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+  case ARM::R12: return ARM::SP;  case ARM::SP:  return ARM::LR;
+  case ARM::LR:  return ARM::PC;  case ARM::PC:  return ARM::R0;
+  }
+}
 
-  for (++RI; RI != RE; ++RI) {
-    const std::pair<unsigned, SMLoc> &RegInfo = *RI;
-    unsigned Reg = getARMRegisterNumbering(RegInfo.first);
+/// Parse a register list.
+bool ARMAsmParser::
+parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
 
-    if (RegMap[Reg]) {
-      Error(RegInfo.second, "register duplicated in register list");
-      return true;
+  // Check the first register in the list to see what register class
+  // this is a list of.
+  int Reg = tryParseRegister();
+  if (Reg == -1)
+    return Error(RegLoc, "register expected");
+
+  MCRegisterClass *RC;
+  if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+  else
+    return Error(RegLoc, "invalid register in register list");
+
+  // The reglist instructions have at most 16 registers, so reserve
+  // space for that many.
+  SmallVector<std::pair<unsigned, SMLoc>, 16> Registers;
+  // Store the first register.
+  Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+
+  // This starts immediately after the first register token in the list,
+  // so we can see either a comma or a minus (range separator) as a legal
+  // next token.
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      Parser.Lex(); // Eat the comma.
+      SMLoc EndLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1)
+        return Error(EndLoc, "register expected");
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!RC->contains(EndReg))
+        return Error(EndLoc, "invalid register in register list");
+      // Ranges must go from low to high.
+      if (getARMRegisterNumbering(Reg) > getARMRegisterNumbering(EndReg))
+        return Error(EndLoc, "bad range in register list");
+
+      // Add all the registers in the range to the register list.
+      while (Reg != EndReg) {
+        Reg = getNextRegister(Reg);
+        Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+      }
+      continue;
     }
-
-    if (!EmittedWarning && Reg < HighRegNum)
-      Warning(RegInfo.second,
-              "register not in ascending order in register list");
-
-    RegMap[Reg] = true;
-    HighRegNum = std::max(Reg, HighRegNum);
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    Reg = tryParseRegister();
+    if (Reg == -1)
+      return Error(RegLoc, "register expected");
+    // The register must be in the same register class as the first.
+    if (!RC->contains(Reg))
+      return Error(RegLoc, "invalid register in register list");
+    // List must be monotonically increasing.
+    if (getARMRegisterNumbering(Reg) <= getARMRegisterNumbering(OldReg))
+      return Error(RegLoc, "register list not in ascending order");
+    // VFP register lists must also be contiguous.
+    // It's OK to use the enumeration values directly here rather, as the
+    // VFP register classes have the enum sorted properly.
+    if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+        Reg != OldReg + 1)
+      return Error(RegLoc, "non-contiguous register range");
+    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
   }
 
+  SMLoc E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(E, "'}' expected");
+  Parser.Lex(); // Eat '}' token.
+
   Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
   return false;
 }
 
-/// tryParseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -1360,28 +2294,32 @@ tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// tryParseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
   StringRef IFlagsStr = Tok.getString();
 
+  // An iflags string of "none" is interpreted to mean that none of the AIF
+  // bits are set.  Not a terribly useful instruction, but a valid encoding.
   unsigned IFlags = 0;
-  for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
-    unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
-    .Case("a", ARM_PROC::A)
-    .Case("i", ARM_PROC::I)
-    .Case("f", ARM_PROC::F)
-    .Default(~0U);
+  if (IFlagsStr != "none") {
+        for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+        .Case("a", ARM_PROC::A)
+        .Case("i", ARM_PROC::I)
+        .Case("f", ARM_PROC::F)
+        .Default(~0U);
+
+      // If some specific iflag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (Flag == ~0U || (IFlags & Flag))
+        return MatchOperand_NoMatch;
 
-    // If some specific iflag is already set, it means that some letter is
-    // present more than once, this is not acceptable.
-    if (Flag == ~0U || (IFlags & Flag))
-      return MatchOperand_NoMatch;
-
-    IFlags |= Flag;
+      IFlags |= Flag;
+    }
   }
 
   Parser.Lex(); // Eat identifier token.
@@ -1389,18 +2327,49 @@ tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// tryParseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
   StringRef Mask = Tok.getString();
 
+  if (isMClass()) {
+    // See ARMv6-M 10.1.1
+    unsigned FlagsVal = StringSwitch<unsigned>(Mask)
+      .Case("apsr", 0)
+      .Case("iapsr", 1)
+      .Case("eapsr", 2)
+      .Case("xpsr", 3)
+      .Case("ipsr", 5)
+      .Case("epsr", 6)
+      .Case("iepsr", 7)
+      .Case("msp", 8)
+      .Case("psp", 9)
+      .Case("primask", 16)
+      .Case("basepri", 17)
+      .Case("basepri_max", 18)
+      .Case("faultmask", 19)
+      .Case("control", 20)
+      .Default(~0U);
+    
+    if (FlagsVal == ~0U)
+      return MatchOperand_NoMatch;
+
+    if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19)
+      // basepri, basepri_max and faultmask only valid for V7m.
+      return MatchOperand_NoMatch;
+    
+    Parser.Lex(); // Eat identifier token.
+    Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+    return MatchOperand_Success;
+  }
+
   // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
   size_t Start = 0, Next = Mask.find('_');
   StringRef Flags = "";
-  StringRef SpecReg = Mask.slice(Start, Next);
+  std::string SpecReg = LowercaseString(Mask.slice(Start, Next));
   if (Next != StringRef::npos)
     Flags = Mask.slice(Next+1, Mask.size());
 
@@ -1411,7 +2380,7 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 
   if (SpecReg == "apsr") {
     FlagsVal = StringSwitch<unsigned>(Flags)
-    .Case("nzcvq",  0x8) // same as CPSR_c
+    .Case("nzcvq",  0x8) // same as CPSR_f
     .Case("g",      0x4) // same as CPSR_s
     .Case("nzcvqg", 0xc) // same as CPSR_fs
     .Default(~0U);
@@ -1420,7 +2389,7 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       if (!Flags.empty())
         return MatchOperand_NoMatch;
       else
-        FlagsVal = 0; // No flag
+        FlagsVal = 8; // No flag
     }
   } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
     if (Flags == "all") // cpsr_all is an alias for cpsr_fc
@@ -1455,96 +2424,680 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// tryParseMemMode2Operand - Try to parse memory addressing mode 2 operand.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemMode2Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op,
+            int Low, int High) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  std::string LowerOp = LowercaseString(Op);
+  std::string UpperOp = UppercaseString(Op);
+  if (ShiftName != LowerOp && ShiftName != UpperOp) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat shift type token.
 
-  if (ParseMemory(Operands, ARMII::AddrMode2))
-    return MatchOperand_NoMatch;
+  // There must be a '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().ParseExpression(ShiftAmount)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(Loc, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+  if (Val < Low || Val > High) {
+    Error(Loc, "immediate value out of range");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateImm(CE, Loc, Parser.getTok().getLoc()));
 
   return MatchOperand_Success;
 }
 
-/// tryParseMemMode3Operand - Try to parse memory addressing mode 3 operand.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemMode3Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Tok.getLoc(), "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = StringSwitch<int>(Tok.getString())
+    .Case("be", 1)
+    .Case("le", 0)
+    .Default(-1);
+  Parser.Lex(); // Eat the token.
+
+  if (Val == -1) {
+    Error(Tok.getLoc(), "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val,
+                                                                  getContext()),
+                                           S, Parser.getTok().getLoc()));
+  return MatchOperand_Success;
+}
 
-  if (ParseMemory(Operands, ARMII::AddrMode3))
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+///     lsl #n  'n' in [0,31]
+///     asr #n  'n' in [1,32]
+///             n == 32 encoded as n == 0.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  bool isASR;
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    isASR = false;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    isASR = true;
+  else {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().ParseExpression(ShiftAmount)) {
+    Error(E, "malformed shift expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(E, "shift amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  if (isASR) {
+    // Shift amount must be in [1,32]
+    if (Val < 1 || Val > 32) {
+      Error(E, "'asr' shift amount must be in range [1,32]");
+      return MatchOperand_ParseFail;
+    }
+    // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+    if (isThumb() && Val == 32) {
+      Error(E, "'asr #32' shift amount not allowed in Thumb mode");
+      return MatchOperand_ParseFail;
+    }
+    if (Val == 32) Val = 0;
+  } else {
+    // Shift amount must be in [1,32]
+    if (Val < 0 || Val > 31) {
+      Error(E, "'lsr' shift amount must be in range [0,31]");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  E = Parser.getTok().getLoc();
+  Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, E));
+
+  return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+///     ror #n  'n' in {0, 8, 16, 24}
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName != "ror" && ShiftName != "ROR")
     return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a rotate amount.
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().ParseExpression(ShiftAmount)) {
+    Error(E, "malformed rotate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(E, "rotate amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  // Shift amount must be in {0, 8, 16, 24} (0 is undocumented extension)
+  // normally, zero is represented in asm by omitting the rotate operand
+  // entirely.
+  if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+    Error(E, "'ror' rotate amount must be 8, 16, or 24");
+    return MatchOperand_ParseFail;
+  }
+
+  E = Parser.getTok().getLoc();
+  Operands.push_back(ARMOperand::CreateRotImm(Val, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  // The bitfield descriptor is really two operands, the LSB and the width.
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *LSBExpr;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().ParseExpression(LSBExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+  if (!CE) {
+    Error(E, "'lsb' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t LSB = CE->getValue();
+  // The LSB must be in the range [0,31]
+  if (LSB < 0 || LSB > 31) {
+    Error(E, "'lsb' operand must be in the range [0,31]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  // Expect another immediate operand.
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "too few operands");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *WidthExpr;
+  if (getParser().ParseExpression(WidthExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  CE = dyn_cast<MCConstantExpr>(WidthExpr);
+  if (!CE) {
+    Error(E, "'width' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Width = CE->getValue();
+  // The LSB must be in the range [1,32-lsb]
+  if (Width < 1 || Width > 32 - LSB) {
+    Error(E, "'width' operand must be in the range [1,32-lsb]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // postidx_reg := '+' register {, shift}
+  //              | '-' register {, shift}
+  //              | register {, shift}
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  bool haveEaten = false;
+  bool isAdd = true;
+  int Reg = -1;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+  if (Parser.getTok().is(AsmToken::Identifier))
+    Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc E = Parser.getTok().getLoc();
+
+  ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+      return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                  ShiftImm, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // am3offset := '+' register
+  //              | '-' register
+  //              | register
+  //              | # imm
+  //              | # + imm
+  //              | # - imm
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  // Do immediates first, as we always parse those if we have a '#'.
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat the '#'.
+    // Explicitly look for a '-', as we need to encode negative zero
+    // differently.
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().ParseExpression(Offset))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    SMLoc E = Tok.getLoc();
+    // Negative zero is encoded as the flag value INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      Val = INT32_MIN;
+
+    Operands.push_back(
+      ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E));
+
+    return MatchOperand_Success;
+  }
+
+
+  bool haveEaten = false;
+  bool isAdd = true;
+  int Reg = -1;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+  if (Parser.getTok().is(AsmToken::Identifier))
+    Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc E = Parser.getTok().getLoc();
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                  0, S, E));
 
   return MatchOperand_Success;
 }
 
-/// CvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// cvtT2LdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateReg(0));
+  // addr
+  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtT2StrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
 bool ARMAsmParser::
-CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateReg(0));
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+
+/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// cvtLdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt, Rt2
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  // addr
+  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+  // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
   return true;
 }
 
-/// CvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// cvtStrdPre - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
 bool ARMAsmParser::
-CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+cvtStrdPre(MCInst &Inst, unsigned Opcode,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt, Rt2
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+  // pred
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
   return true;
 }
 
-/// CvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
 bool ARMAsmParser::
-CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
   // Create a writeback register dummy placeholder.
   Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
   ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
   return true;
 }
 
-/// CvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// cvtThumbMultiple- Convert parsed operands to MCInst.
 /// Needed here because the Asm Gen Matcher can't handle properly tied operands
 /// when they refer multiple MIOperands inside a single one.
 bool ARMAsmParser::
-CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // The second source operand must be the same register as the destination
+  // operand.
+  if (Operands.size() == 6 &&
+      (((ARMOperand*)Operands[3])->getReg() !=
+       ((ARMOperand*)Operands[5])->getReg()) &&
+      (((ARMOperand*)Operands[3])->getReg() !=
+       ((ARMOperand*)Operands[4])->getReg())) {
+    Error(Operands[3]->getStartLoc(),
+          "destination register must match source register");
+    return false;
+  }
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
+  ((ARMOperand*)Operands[4])->addRegOperands(Inst, 1);
+  // If we have a three-operand form, use that, else the second source operand
+  // is just the destination operand again.
+  if (Operands.size() == 6)
+    ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
+  else
+    Inst.addOperand(Inst.getOperand(0));
+  ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
+
   return true;
 }
 
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error.  The first token must be a '[' when called.
-///
-/// TODO Only preindexing and postindexing addressing are started, unindexed
-/// with option, etc are still to do.
 bool ARMAsmParser::
-ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-            ARMII::AddrMode AddrMode = ARMII::AddrModeNone) {
+parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S, E;
   assert(Parser.getTok().is(AsmToken::LBrac) &&
          "Token is not a Left Bracket");
@@ -1552,185 +3105,178 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
   Parser.Lex(); // Eat left bracket token.
 
   const AsmToken &BaseRegTok = Parser.getTok();
-  if (BaseRegTok.isNot(AsmToken::Identifier)) {
-    Error(BaseRegTok.getLoc(), "register expected");
-    return true;
-  }
-  int BaseRegNum = TryParseRegister();
-  if (BaseRegNum == -1) {
-    Error(BaseRegTok.getLoc(), "register expected");
-    return true;
-  }
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
 
   // The next token must either be a comma or a closing bracket.
   const AsmToken &Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac))
-    return true;
+    return Error(Tok.getLoc(), "malformed memory operand");
 
-  bool Preindexed = false;
-  bool Postindexed = false;
-  bool OffsetIsReg = false;
-  bool Negative = false;
-  bool Writeback = false;
-  ARMOperand *WBOp = 0;
-  int OffsetRegNum = -1;
-  bool OffsetRegShifted = false;
-  enum ARM_AM::ShiftOpc ShiftType = ARM_AM::lsl;
-  const MCExpr *ShiftAmount = 0;
-  const MCExpr *Offset = 0;
-
-  // First look for preindexed address forms, that is after the "[Rn" we now
-  // have to see if the next token is a comma.
-  if (Tok.is(AsmToken::Comma)) {
-    Preindexed = true;
-    Parser.Lex(); // Eat comma token.
-
-    if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount,
-                             Offset, OffsetIsReg, OffsetRegNum, E))
-      return true;
-    const AsmToken &RBracTok = Parser.getTok();
-    if (RBracTok.isNot(AsmToken::RBrac)) {
-      Error(RBracTok.getLoc(), "']' expected");
-      return true;
-    }
-    E = RBracTok.getLoc();
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getLoc();
     Parser.Lex(); // Eat right bracket token.
 
-    const AsmToken &ExclaimTok = Parser.getTok();
-    if (ExclaimTok.is(AsmToken::Exclaim)) {
-      // None of addrmode3 instruction uses "!"
-      if (AddrMode == ARMII::AddrMode3)
-        return true;
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift,
+                                             0, 0, false, S, E));
 
-      WBOp = ARMOperand::CreateToken(ExclaimTok.getString(),
-                                     ExclaimTok.getLoc());
-      Writeback = true;
-      Parser.Lex(); // Eat exclaim token
-    } else { // In addressing mode 2, pre-indexed mode always end with "!"
-      if (AddrMode == ARMII::AddrMode2)
-        Preindexed = false;
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand. It's rather odd, but syntactically valid.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
     }
-  } else {
-    // The "[Rn" we have so far was not followed by a comma.
-
-    // If there's anything other than the right brace, this is a post indexing
-    // addressing form.
-    E = Tok.getLoc();
-    Parser.Lex(); // Eat right bracket token.
 
-    const AsmToken &NextTok = Parser.getTok();
+    return false;
+  }
 
-    if (NextTok.isNot(AsmToken::EndOfStatement)) {
-      Postindexed = true;
-      Writeback = true;
+  assert(Tok.is(AsmToken::Comma) && "Lost comma in memory operand?!");
+  Parser.Lex(); // Eat the comma.
 
-      if (NextTok.isNot(AsmToken::Comma)) {
-        Error(NextTok.getLoc(), "',' expected");
-        return true;
-      }
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
 
-      Parser.Lex(); // Eat comma token.
+    const MCExpr *Expr;
+    if (getParser().ParseExpression(Expr))
+     return true;
 
-      if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
-                               ShiftAmount, Offset, OffsetIsReg, OffsetRegNum,
-                               E))
-        return true;
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E, "alignment specifier must be 64, 128, or 256 bits");
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
     }
-  }
 
-  // Force Offset to exist if used.
-  if (!OffsetIsReg) {
-    if (!Offset)
-      Offset = MCConstantExpr::Create(0, getContext());
-  } else {
-    if (AddrMode == ARMII::AddrMode3 && OffsetRegShifted) {
-      Error(E, "shift amount not supported");
-      return true;
+    // Now we should have the closing ']'
+    E = Parser.getTok().getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(E, "']' expected");
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
     }
+
+    return false;
   }
 
-  Operands.push_back(ARMOperand::CreateMem(AddrMode, BaseRegNum, OffsetIsReg,
-                                     Offset, OffsetRegNum, OffsetRegShifted,
-                                     ShiftType, ShiftAmount, Preindexed,
-                                     Postindexed, Negative, Writeback, S, E));
-  if (WBOp)
-    Operands.push_back(WBOp);
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset.
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat the '#'.
+    E = Parser.getTok().getLoc();
 
-  return false;
-}
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().ParseExpression(Offset))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      CE = MCConstantExpr::Create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    E = Parser.getTok().getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(E, "']' expected");
+    Parser.Lex(); // Eat right bracket token.
 
-/// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn],"
-/// we will parse the following (were +/- means that a plus or minus is
-/// optional):
-///   +/-Rm
-///   +/-Rm, shift
-///   #offset
-/// we return false on success or an error otherwise.
-bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
-                                        bool &OffsetRegShifted,
-                                        enum ARM_AM::ShiftOpc &ShiftType,
-                                        const MCExpr *&ShiftAmount,
-                                        const MCExpr *&Offset,
-                                        bool &OffsetIsReg,
-                                        int &OffsetRegNum,
-                                        SMLoc &E) {
-  Negative = false;
-  OffsetRegShifted = false;
-  OffsetIsReg = false;
-  OffsetRegNum = -1;
-  const AsmToken &NextTok = Parser.getTok();
-  E = NextTok.getLoc();
-  if (NextTok.is(AsmToken::Plus))
-    Parser.Lex(); // Eat plus token.
-  else if (NextTok.is(AsmToken::Minus)) {
-    Negative = true;
-    Parser.Lex(); // Eat minus token
-  }
-  // See if there is a register following the "[Rn," or "[Rn]," we have so far.
-  const AsmToken &OffsetRegTok = Parser.getTok();
-  if (OffsetRegTok.is(AsmToken::Identifier)) {
-    SMLoc CurLoc = OffsetRegTok.getLoc();
-    OffsetRegNum = TryParseRegister();
-    if (OffsetRegNum != -1) {
-      OffsetIsReg = true;
-      E = CurLoc;
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
     }
-  }
 
-  // If we parsed a register as the offset then there can be a shift after that.
-  if (OffsetRegNum != -1) {
-    // Look for a comma then a shift
-    const AsmToken &Tok = Parser.getTok();
-    if (Tok.is(AsmToken::Comma)) {
-      Parser.Lex(); // Eat comma token.
+    return false;
+  }
 
-      const AsmToken &Tok = Parser.getTok();
-      if (ParseShift(ShiftType, ShiftAmount, E))
-        return Error(Tok.getLoc(), "shift expected");
-      OffsetRegShifted = true;
-    }
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
   }
-  else { // the "[Rn," or "[Rn,]" we have so far was not followed by "Rm"
-    // Look for #offset following the "[Rn," or "[Rn],"
-    const AsmToken &HashTok = Parser.getTok();
-    if (HashTok.isNot(AsmToken::Hash))
-      return Error(HashTok.getLoc(), "'#' expected");
 
-    Parser.Lex(); // Eat hash token.
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
 
-    if (getParser().ParseExpression(Offset))
-     return true;
-    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  // Now we should have the closing ']'
+  E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(E, "']' expected");
+  Parser.Lex(); // Eat right bracket token.
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand.
+  if (Parser.getTok().is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the '!'.
   }
+
   return false;
 }
 
-/// ParseShift as one of these two:
+/// parseMemRegOffsetShift - one of these two:
 ///   ( lsl | lsr | asr | ror ) , # shift_amount
 ///   rrx
-/// and returns true if it parses a shift otherwise it returns false.
-bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St,
-                              const MCExpr *&ShiftAmount, SMLoc &E) {
+/// return true if it parses a shift otherwise it returns false.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  SMLoc Loc = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
     return true;
@@ -1746,28 +3292,86 @@ bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St,
   else if (ShiftName == "rrx" || ShiftName == "RRX")
     St = ARM_AM::rrx;
   else
-    return true;
+    return Error(Loc, "illegal shift operator");
   Parser.Lex(); // Eat shift type token.
 
-  // Rrx stands alone.
-  if (St == ARM_AM::rrx)
-    return false;
-
-  // Otherwise, there must be a '#' and a shift amount.
-  const AsmToken &HashTok = Parser.getTok();
-  if (HashTok.isNot(AsmToken::Hash))
-    return Error(HashTok.getLoc(), "'#' expected");
-  Parser.Lex(); // Eat hash token.
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
 
-  if (getParser().ParseExpression(ShiftAmount))
-    return true;
+    const MCExpr *Expr;
+    if (getParser().ParseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    Amount = Imm;
+  }
 
   return false;
 }
 
+/// parseFPImm - A floating point immediate expression operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Hash))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '#'.
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 63;
+    int Val = ARM_AM::getFP64Imm(APInt(64, IntVal));
+    Parser.Lex(); // Eat the token.
+    if (Val == -1) {
+      TokError("floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val = Tok.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Val > 255 || Val < 0) {
+      TokError("encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  TokError("invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
 /// Parse a arm instruction operand.  For now this parses the operand regardless
 /// of the mnemonic.
-bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                 StringRef Mnemonic) {
   SMLoc S, E;
 
@@ -1787,13 +3391,20 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
     Error(Parser.getTok().getLoc(), "unexpected token in operand");
     return true;
   case AsmToken::Identifier: {
-    if (!TryParseRegisterWithWriteBack(Operands))
+    // If this is VMRS, check for the apsr_nzcv operand.
+    if (!tryParseRegisterWithWriteBack(Operands))
       return false;
-    int Res = TryParseShiftRegister(Operands);
+    int Res = tryParseShiftRegister(Operands);
     if (Res == 0) // success
       return false;
     else if (Res == -1) // irrecoverable error
       return true;
+    if (Mnemonic == "vmrs" && Parser.getTok().getString() == "apsr_nzcv") {
+      S = Parser.getTok().getLoc();
+      Parser.Lex();
+      Operands.push_back(ARMOperand::CreateToken("apsr_nzcv", S));
+      return false;
+    }
 
     // Fall though for the Identifier case that is not a register or a
     // special name.
@@ -1811,26 +3422,36 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
     return false;
   }
   case AsmToken::LBrac:
-    return ParseMemory(Operands);
+    return parseMemory(Operands);
   case AsmToken::LCurly:
-    return ParseRegisterList(Operands);
-  case AsmToken::Hash:
+    return parseRegisterList(Operands);
+  case AsmToken::Hash: {
     // #42 -> immediate.
     // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
     S = Parser.getTok().getLoc();
     Parser.Lex();
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
     const MCExpr *ImmVal;
     if (getParser().ParseExpression(ImmVal))
       return true;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      ImmVal = MCConstantExpr::Create(INT32_MIN, getContext());
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
     Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
     return false;
+  }
   case AsmToken::Colon: {
     // ":lower16:" and ":upper16:" expression prefixes
     // FIXME: Check it's an expression prefix,
     // e.g. (FOO - :lower16:BAR) isn't legal.
     ARMMCExpr::VariantKind RefKind;
-    if (ParsePrefix(RefKind))
+    if (parsePrefix(RefKind))
       return true;
 
     const MCExpr *SubExprVal;
@@ -1846,9 +3467,9 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
   }
 }
 
-// ParsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
 //  :lower16: and :upper16:.
-bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) {
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
   RefKind = ARMMCExpr::VK_ARM_None;
 
   // :lower16: and :upper16: modifiers
@@ -1879,55 +3500,16 @@ bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) {
   return false;
 }
 
-const MCExpr *
-ARMAsmParser::ApplyPrefixToExpr(const MCExpr *E,
-                                MCSymbolRefExpr::VariantKind Variant) {
-  // Recurse over the given expression, rebuilding it to apply the given variant
-  // to the leftmost symbol.
-  if (Variant == MCSymbolRefExpr::VK_None)
-    return E;
-
-  switch (E->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle target expr yet");
-  case MCExpr::Constant:
-    llvm_unreachable("Can't handle lower16/upper16 of constant yet");
-
-  case MCExpr::SymbolRef: {
-    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
-
-    if (SRE->getKind() != MCSymbolRefExpr::VK_None)
-      return 0;
-
-    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, getContext());
-  }
-
-  case MCExpr::Unary:
-    llvm_unreachable("Can't handle unary expressions yet");
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
-    const MCExpr *LHS = ApplyPrefixToExpr(BE->getLHS(), Variant);
-    const MCExpr *RHS = BE->getRHS();
-    if (!LHS)
-      return 0;
-
-    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
-  }
-  }
-
-  assert(0 && "Invalid expression kind!");
-  return 0;
-}
-
 /// \brief Given a mnemonic, split out possible predication code and carry
 /// setting letters to form a canonical mnemonic and flags.
 //
 // FIXME: Would be nice to autogen this.
-static StringRef SplitMnemonic(StringRef Mnemonic,
-                               unsigned &PredicationCode,
-                               bool &CarrySetting,
-                               unsigned &ProcessorIMod) {
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+                                      unsigned &PredicationCode,
+                                      bool &CarrySetting,
+                                      unsigned &ProcessorIMod,
+                                      StringRef &ITMask) {
   PredicationCode = ARMCC::AL;
   CarrySetting = false;
   ProcessorIMod = 0;
@@ -1935,23 +3517,22 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
   // Ignore some mnemonics we know aren't predicated forms.
   //
   // FIXME: Would be nice to autogen this.
-  if (Mnemonic == "teq" || Mnemonic == "vceq" ||
-      Mnemonic == "movs" ||
-      Mnemonic == "svc" ||
-      (Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
-       Mnemonic == "vmls" || Mnemonic == "vnmls") ||
-      Mnemonic == "vacge" || Mnemonic == "vcge" ||
-      Mnemonic == "vclt" ||
-      Mnemonic == "vacgt" || Mnemonic == "vcgt" ||
-      Mnemonic == "vcle" ||
-      (Mnemonic == "smlal" || Mnemonic == "umaal" || Mnemonic == "umlal" ||
-       Mnemonic == "vabal" || Mnemonic == "vmlal" || Mnemonic == "vpadal" ||
-       Mnemonic == "vqdmlal" || Mnemonic == "bics"))
+  if ((Mnemonic == "movs" && isThumb()) ||
+      Mnemonic == "teq"   || Mnemonic == "vceq"   || Mnemonic == "svc"   ||
+      Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
+      Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
+      Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
+      Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
+      Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
+      Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal")
     return Mnemonic;
 
   // First, split out any predication code. Ignore mnemonics we know aren't
   // predicated but do have a carry-set and so weren't caught above.
-  if (Mnemonic != "adcs") {
+  if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+      Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+      Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+      Mnemonic != "sbcs" && Mnemonic != "rscs") {
     unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
       .Case("eq", ARMCC::EQ)
       .Case("ne", ARMCC::NE)
@@ -1980,11 +3561,12 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
   // Next, determine if we have a carry setting bit. We explicitly ignore all
   // the instructions we know end in 's'.
   if (Mnemonic.endswith("s") &&
-      !(Mnemonic == "asrs" || Mnemonic == "cps" || Mnemonic == "mls" ||
-        Mnemonic == "movs" || Mnemonic == "mrs" || Mnemonic == "smmls" ||
-        Mnemonic == "vabs" || Mnemonic == "vcls" || Mnemonic == "vmls" ||
-        Mnemonic == "vmrs" || Mnemonic == "vnmls" || Mnemonic == "vqabs" ||
-        Mnemonic == "vrecps" || Mnemonic == "vrsqrts")) {
+      !(Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+        Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+        Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+        Mnemonic == "vrsqrts" || Mnemonic == "srs" ||
+        (Mnemonic == "movs" && isThumb()))) {
     Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
     CarrySetting = true;
   }
@@ -2004,6 +3586,12 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
     }
   }
 
+  // The "it" instruction has the condition mask on the end of the mnemonic.
+  if (Mnemonic.startswith("it")) {
+    ITMask = Mnemonic.slice(2, Mnemonic.size());
+    Mnemonic = Mnemonic.slice(0, 2);
+  }
+
   return Mnemonic;
 }
 
@@ -2012,37 +3600,154 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
 //
 // FIXME: It would be nice to autogen this.
 void ARMAsmParser::
-GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
                       bool &CanAcceptPredicationCode) {
   if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
       Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
-      Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" ||
+      Mnemonic == "add" || Mnemonic == "adc" ||
       Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
-      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mvn" ||
+      Mnemonic == "orr" || Mnemonic == "mvn" ||
       Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
-      Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" ||
-      Mnemonic == "eor" || Mnemonic == "smlal" ||
-      (Mnemonic == "mov" && !isThumbOne())) {
+      Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+      (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
+                      Mnemonic == "mla" || Mnemonic == "smlal" ||
+                      Mnemonic == "umlal" || Mnemonic == "umull"))) {
     CanAcceptCarrySet = true;
-  } else {
+  } else
     CanAcceptCarrySet = false;
-  }
 
   if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
       Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
       Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
       Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
-      Mnemonic == "dsb" || Mnemonic == "movs" || Mnemonic == "isb" ||
-      Mnemonic == "clrex" || Mnemonic.startswith("cps")) {
+      Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" ||
+      (Mnemonic == "clrex" && !isThumb()) ||
+      (Mnemonic == "nop" && isThumbOne()) ||
+      ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
+        Mnemonic == "ldc2" || Mnemonic == "ldc2l" ||
+        Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) ||
+      ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) &&
+       !isThumb()) ||
+      Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) {
     CanAcceptPredicationCode = false;
-  } else {
+  } else
     CanAcceptPredicationCode = true;
-  }
 
-  if (isThumb())
+  if (isThumb()) {
     if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
         Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
       CanAcceptPredicationCode = false;
+  }
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // FIXME: This is all horribly hacky. We really need a better way to deal
+  // with optional operands like this in the matcher table.
+
+  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+  // another does not. Specifically, the MOVW instruction does not. So we
+  // special case it here and remove the defaulted (non-setting) cc_out
+  // operand if that's the instruction we're trying to match.
+  //
+  // We do this as post-processing of the explicit operands rather than just
+  // conditionally adding the cc_out in the first place because we need
+  // to check the type of the parsed immediate operand.
+  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+      !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() &&
+      static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+    return true;
+
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when there are only two register operands.
+  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+    return true;
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
+  // have to check the immediate range here since Thumb2 has a variant
+  // that can handle a different range and has a cc_out operand.
+  if (((isThumb() && Mnemonic == "add") ||
+       (isThumbTwo() && Mnemonic == "sub")) &&
+      Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      (static_cast<ARMOperand*>(Operands[5])->isReg() ||
+       static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4()))
+    return true;
+  // For Thumb2, add/sub immediate does not have a cc_out operand for the
+  // imm0_4095 variant. That's the least-preferred variant when
+  // selecting via the generic "add" mnemonic, so to know that we
+  // should remove the cc_out operand, we have to explicitly check that
+  // it's not one of the other variants. Ugh.
+  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    // Nest conditions rather than one big 'if' statement for readability.
+    //
+    // If either register is a high reg, it's either one of the SP
+    // variants (handled above) or a 32-bit encoding, so we just
+    // check against T3.
+    if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+         !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) &&
+        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+      return false;
+    // If both registers are low, we're in an IT block, and the immediate is
+    // in range, we should use encoding T1 instead, which has a cc_out.
+    if (inITBlock() &&
+        isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) &&
+        isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
+        static_cast<ARMOperand*>(Operands[5])->isImm0_7())
+      return false;
+
+    // Otherwise, we use encoding T4, which does not have a cc_out
+    // operand.
+    return true;
+  }
+
+  // The thumb2 multiply instruction doesn't have a CCOut register, so
+  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+  // use the 16-bit encoding or not.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[5])->isReg() &&
+      // If the registers aren't low regs, the destination reg isn't the
+      // same as one of the source regs, or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
+       !inITBlock() ||
+       (static_cast<ARMOperand*>(Operands[3])->getReg() !=
+        static_cast<ARMOperand*>(Operands[5])->getReg() &&
+        static_cast<ARMOperand*>(Operands[3])->getReg() !=
+        static_cast<ARMOperand*>(Operands[4])->getReg())))
+    return true;
+
+
+
+  // Register-register 'add/sub' for thumb does not have a cc_out operand
+  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+  // right, this will result in better diagnostics (which operand is off)
+  // anyway.
+  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      (Operands.size() == 5 || Operands.size() == 6) &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+    return true;
+
+  return false;
 }
 
 /// Parse an arm instruction mnemonic followed by its operands.
@@ -2050,16 +3755,51 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Create the leading tokens for the mnemonic, split by '.' characters.
   size_t Start = 0, Next = Name.find('.');
-  StringRef Head = Name.slice(Start, Next);
+  StringRef Mnemonic = Name.slice(Start, Next);
 
   // Split out the predication code and carry setting flag from the mnemonic.
   unsigned PredicationCode;
   unsigned ProcessorIMod;
   bool CarrySetting;
-  Head = SplitMnemonic(Head, PredicationCode, CarrySetting,
-                       ProcessorIMod);
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    Parser.EatToEndOfStatement();
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      Parser.EatToEndOfStatement();
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        Parser.EatToEndOfStatement();
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (ITMask[i - 1] == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
 
-  Operands.push_back(ARMOperand::CreateToken(Head, NameLoc));
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
 
   // Next, add the CCOut and ConditionCode operands, if needed.
   //
@@ -2069,34 +3809,36 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
   // the matcher deal with finding the right instruction or generating an
   // appropriate error.
   bool CanAcceptCarrySet, CanAcceptPredicationCode;
-  GetMnemonicAcceptInfo(Head, CanAcceptCarrySet, CanAcceptPredicationCode);
+  getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
 
   // If we had a carry-set on an instruction that can't do that, issue an
   // error.
   if (!CanAcceptCarrySet && CarrySetting) {
     Parser.EatToEndOfStatement();
-    return Error(NameLoc, "instruction '" + Head +
+    return Error(NameLoc, "instruction '" + Mnemonic +
                  "' can not set flags, but 's' suffix specified");
   }
+  // If we had a predication code on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    Parser.EatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' is not predicable, but condition code specified");
+  }
 
   // Add the carry setting operand, if necessary.
-  //
-  // FIXME: It would be awesome if we could somehow invent a location such that
-  // match errors on this operand would print a nice diagnostic about how the
-  // 's' character in the mnemonic resulted in a CCOut operand.
-  if (CanAcceptCarrySet)
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
     Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
-                                               NameLoc));
+                                               Loc));
+  }
 
   // Add the predication code operand, if necessary.
   if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
     Operands.push_back(ARMOperand::CreateCondCode(
-                         ARMCC::CondCodes(PredicationCode), NameLoc));
-  } else {
-    // This mnemonic can't ever accept a predication code, but the user wrote
-    // one (or misspelled another mnemonic).
-
-    // FIXME: Issue a nice error.
+                         ARMCC::CondCodes(PredicationCode), Loc));
   }
 
   // Add the processor imod operand, if necessary.
@@ -2104,11 +3846,6 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
     Operands.push_back(ARMOperand::CreateImm(
           MCConstantExpr::Create(ProcessorIMod, getContext()),
                                  NameLoc, NameLoc));
-  } else {
-    // This mnemonic can't ever accept a imod, but the user wrote
-    // one (or misspelled another mnemonic).
-
-    // FIXME: Issue a nice error.
   }
 
   // Add the remaining tokens in the mnemonic.
@@ -2117,13 +3854,19 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
     Next = Name.find('.', Start + 1);
     StringRef ExtraToken = Name.slice(Start, Next);
 
-    Operands.push_back(ARMOperand::CreateToken(ExtraToken, NameLoc));
+    // For now, we're only parsing Thumb1 (for the most part), so
+    // just ignore ".n" qualifiers. We'll use them to restrict
+    // matching when we do Thumb2.
+    if (ExtraToken != ".n") {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
   }
 
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
-    if (ParseOperand(Operands, Head)) {
+    if (parseOperand(Operands, Mnemonic)) {
       Parser.EatToEndOfStatement();
       return true;
     }
@@ -2132,7 +3875,7 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
       Parser.Lex();  // Eat the comma.
 
       // Parse and remember the operand.
-      if (ParseOperand(Operands, Head)) {
+      if (parseOperand(Operands, Mnemonic)) {
         Parser.EatToEndOfStatement();
         return true;
       }
@@ -2140,75 +3883,548 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
     Parser.EatToEndOfStatement();
-    return TokError("unexpected token in argument list");
+    return Error(Loc, "unexpected token in argument list");
   }
 
   Parser.Lex(); // Consume the EndOfStatement
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the the
+  // mnemonic, of course (CarrySetting == true). Reason number #317 the
+  // table driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand*>(Operands[2])->isImm()) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
+  // The vector-compare-to-zero instructions have a literal token "#0" at
+  // the end that comes to here as an immediate operand. Convert it to a
+  // token to play nicely with the matcher.
+  if ((Mnemonic == "vceq" || Mnemonic == "vcge" || Mnemonic == "vcgt" ||
+      Mnemonic == "vcle" || Mnemonic == "vclt") && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+    if (CE && CE->getValue() == 0) {
+      Operands.erase(Operands.begin() + 5);
+      Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+      delete Op;
+    }
+  }
+  // VCMP{E} does the same thing, but with a different operand count.
+  if ((Mnemonic == "vcmp" || Mnemonic == "vcmpe") && Operands.size() == 5 &&
+      static_cast<ARMOperand*>(Operands[4])->isImm()) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[4]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+    if (CE && CE->getValue() == 0) {
+      Operands.erase(Operands.begin() + 4);
+      Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+      delete Op;
+    }
+  }
+  // Similarly, the Thumb1 "RSB" instruction has a literal "#0" on the
+  // end. Convert it to a token here.
+  if (Mnemonic == "rsb" && isThumb() && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+    if (CE && CE->getValue() == 0) {
+      Operands.erase(Operands.begin() + 5);
+      Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+      delete Op;
+    }
+  }
+
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// return 'true' if register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
+                                 unsigned HiReg, bool &containsReg) {
+  containsReg = false;
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      containsReg = true;
+    // Anything other than a low register isn't legal here.
+    if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+      return true;
+  }
+  return false;
+}
+
+// Check if the specified regisgter is in the register list of the inst,
+// starting at the indicated operand number.
+static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      return true;
+  }
+  return false;
+}
+
+// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
+// the ARMInsts array) instead. Getting that here requires awkward
+// API changes, though. Better way?
+namespace llvm {
+extern MCInstrDesc ARMInsts[];
+}
+static MCInstrDesc &getInstDesc(unsigned Opcode) {
+  return ARMInsts[Opcode];
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::
+validateInstruction(MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  SMLoc Loc = Operands[0]->getStartLoc();
+  // Check the IT block state first.
+  // NOTE: In Thumb mode, the BKPT instruction has the interesting property of
+  // being allowed in IT blocks, but not being predicable.  It just always
+  // executes.
+  if (inITBlock() && Inst.getOpcode() != ARM::tBKPT) {
+    unsigned bit = 1;
+    if (ITState.FirstCond)
+      ITState.FirstCond = false;
+    else
+      bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+    // The instruction must be predicable.
+    if (!MCID.isPredicable())
+      return Error(Loc, "instructions in IT block must be predicable");
+    unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+    unsigned ITCond = bit ? ITState.Cond :
+      ARMCC::getOppositeCondition(ITState.Cond);
+    if (Cond != ITCond) {
+      // Find the condition code Operand to get its SMLoc information.
+      SMLoc CondLoc;
+      for (unsigned i = 1; i < Operands.size(); ++i)
+        if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
+          CondLoc = Operands[i]->getStartLoc();
+      return Error(CondLoc, "incorrect condition in IT block; got '" +
+                   StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+                   "', but expected '" +
+                   ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'");
+    }
+  // Check for non-'al' condition codes outside of the IT block.
+  } else if (isThumbTwo() && MCID.isPredicable() &&
+             Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+             ARMCC::AL && Inst.getOpcode() != ARM::tB &&
+             Inst.getOpcode() != ARM::t2B)
+    return Error(Loc, "predicated instructions must be in IT block");
+
+  switch (Inst.getOpcode()) {
+  case ARM::LDRD:
+  case ARM::LDRD_PRE:
+  case ARM::LDRD_POST:
+  case ARM::LDREXD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
+    unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands must be sequential");
+    return false;
+  }
+  case ARM::STRD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
+    unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::STRD_PRE:
+  case ARM::STRD_POST:
+  case ARM::STREXD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+    unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(2).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::SBFX:
+  case ARM::UBFX: {
+    // width must be in range [1, 32-lsb]
+    unsigned lsb = Inst.getOperand(2).getImm();
+    unsigned widthm1 = Inst.getOperand(3).getImm();
+    if (widthm1 >= 32 - lsb)
+      return Error(Operands[5]->getStartLoc(),
+                   "bitfield width must be in range [1,32-lsb]");
+    return false;
+  }
+  case ARM::tLDMIA: {
+    // If we're parsing Thumb2, the .w variant is available and handles
+    // most cases that are normally illegal for a Thumb1 LDM
+    // instruction. We'll make the transformation in processInstruction()
+    // if necessary.
+    //
+    // Thumb LDM instructions are writeback iff the base register is not
+    // in the register list.
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+      (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+       static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    // If we should have writeback, then there should be a '!' token.
+    if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "writeback operator '!' expected");
+    // If we should not have writeback, there must not be a '!'. This is
+    // true even for the 32-bit wide encodings.
+    if (listContainsBase && hasWritebackToken)
+      return Error(Operands[3]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    break;
+  }
+  case ARM::t2LDMIA_UPD: {
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+    break;
+  }
+  case ARM::tPOP: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, 0, ARM::PC, listContainsBase))
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or pc");
+    break;
+  }
+  case ARM::tPUSH: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, 0, ARM::LR, listContainsBase))
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or lr");
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+      return Error(Operands[4]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    break;
+  }
+  }
+
   return false;
 }
 
+void ARMAsmParser::
+processInstruction(MCInst &Inst,
+                   const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  switch (Inst.getOpcode()) {
+  case ARM::LDMIA_UPD:
+    // If this is a load of a single register via a 'pop', then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::LDR_POST_IMM);
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(1)); // Rn
+      TmpInst.addOperand(MCOperand::CreateReg(0));  // am2offset
+      TmpInst.addOperand(MCOperand::CreateImm(4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  case ARM::STMDB_UPD:
+    // If this is a store of a single register via a 'push', then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::STR_PRE_IMM);
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+      TmpInst.addOperand(MCOperand::CreateImm(-4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  case ARM::tADDi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6)
+      Inst.setOpcode(ARM::tADDi3);
+    break;
+  case ARM::tSUBi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6)
+      Inst.setOpcode(ARM::tSUBi3);
+    break;
+  case ARM::tB:
+    // A Thumb conditional branch outside of an IT block is a tBcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock())
+      Inst.setOpcode(ARM::tBcc);
+    break;
+  case ARM::t2B:
+    // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock())
+      Inst.setOpcode(ARM::t2Bcc);
+    break;
+  case ARM::t2Bcc:
+    // If the conditional is AL or we're in an IT block, we really want t2B.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock())
+      Inst.setOpcode(ARM::t2B);
+    break;
+  case ARM::tBcc:
+    // If the conditional is AL, we really want tB.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL)
+      Inst.setOpcode(ARM::tB);
+    break;
+  case ARM::tLDMIA: {
+    // If the register list contains any high registers, or if the writeback
+    // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+    // instead if we're in Thumb2. Otherwise, this should have generated
+    // an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+      (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+       static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+        (!listContainsBase && !hasWritebackToken) ||
+        (listContainsBase && hasWritebackToken)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+      // If we're switching to the updating version, we need to insert
+      // the writeback tied operand.
+      if (hasWritebackToken)
+        Inst.insert(Inst.begin(),
+                    MCOperand::CreateReg(Inst.getOperand(0).getReg()));
+    }
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(ARM::t2STMIA_UPD);
+    }
+    break;
+  }
+  case ARM::t2MOVi: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        Inst.getOperand(1).getImm() <= 255 &&
+        ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+         Inst.getOperand(4).getReg() == ARM::CPSR) ||
+        (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      // The operands aren't in the same order for tMOVi8...
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  }
+  case ARM::t2MOVr: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == ARMCC::AL &&
+        Inst.getOperand(4).getReg() == ARM::CPSR &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      // The operands aren't the same for tMOV[S]r... (no cc_out)
+      MCInst TmpInst;
+      TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  }
+  case ARM::t2SXTH:
+  case ARM::t2SXTB:
+  case ARM::t2UXTH:
+  case ARM::t2UXTB: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("Illegal opcode!");
+      case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+      case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+      case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+      case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+      }
+      // The operands aren't the same for thumb1 (no rotate operand).
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+    }
+    break;
+  }
+  case ARM::t2IT: {
+    // The mask bits for all but the first condition are represented as
+    // the low bit of the condition code value implies 't'. We currently
+    // always have 1 implies 't', so XOR toggle the bits if the low bit
+    // of the condition code is zero. The encoding also expects the low
+    // bit of the condition to be encoded as bit 4 of the mask operand,
+    // so mask that in if needed
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned Mask = MO.getImm();
+    unsigned OrigMask = Mask;
+    unsigned TZ = CountTrailingZeros_32(Mask);
+    if ((Inst.getOperand(0).getImm() & 1) == 0) {
+      assert(Mask && TZ <= 3 && "illegal IT mask value!");
+      for (unsigned i = 3; i != TZ; --i)
+        Mask ^= 1 << i;
+    } else
+      Mask |= 0x10;
+    MO.setImm(Mask);
+
+    // Set up the IT block state according to the IT instruction we just
+    // matched.
+    assert(!inITBlock() && "nested IT blocks?!");
+    ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+    ITState.Mask = OrigMask; // Use the original mask, not the updated one.
+    ITState.CurPosition = 0;
+    ITState.FirstCond = true;
+    break;
+  }
+  }
+}
+
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // 16-bit thumb arithmetic instructions either require or preclude the 'S'
+  // suffix depending on whether they're in an IT block or not.
+  unsigned Opc = Inst.getOpcode();
+  MCInstrDesc &MCID = getInstDesc(Opc);
+  if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+    assert(MCID.hasOptionalDef() &&
+           "optionally flag setting instruction missing optional def operand");
+    assert(MCID.NumOperands == Inst.getNumOperands() &&
+           "operand count mismatch!");
+    // Find the optional-def operand (cc_out).
+    unsigned OpNo;
+    for (OpNo = 0;
+         !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands;
+         ++OpNo)
+      ;
+    // If we're parsing Thumb1, reject it completely.
+    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+      return Match_MnemonicFail;
+    // If we're parsing Thumb2, which form is legal depends on whether we're
+    // in an IT block.
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+        !inITBlock())
+      return Match_RequiresITBlock;
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+        inITBlock())
+      return Match_RequiresNotITBlock;
+  }
+  // Some high-register supporting Thumb1 encodings only allow both registers
+  // to be from r0-r7 when in Thumb2.
+  else if (Opc == ARM::tADDhirr && isThumbOne() &&
+           isARMLowRegister(Inst.getOperand(1).getReg()) &&
+           isARMLowRegister(Inst.getOperand(2).getReg()))
+    return Match_RequiresThumb2;
+  // Others only require ARMv6 or later.
+  else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() &&
+           isARMLowRegister(Inst.getOperand(0).getReg()) &&
+           isARMLowRegister(Inst.getOperand(1).getReg()))
+    return Match_RequiresV6;
+  return Match_Success;
+}
+
 bool ARMAsmParser::
 MatchAndEmitInstruction(SMLoc IDLoc,
                         SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                         MCStreamer &Out) {
   MCInst Inst;
   unsigned ErrorInfo;
-  MatchResultTy MatchResult, MatchResult2;
+  unsigned MatchResult;
   MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
-  if (MatchResult != Match_Success) {
-    // If we get a Match_InvalidOperand it might be some arithmetic instruction
-    // that does not update the condition codes.  So try adding a CCOut operand
-    // with a value of reg0.
-    if (MatchResult == Match_InvalidOperand) {
-      Operands.insert(Operands.begin() + 1,
-                      ARMOperand::CreateCCOut(0,
-                                  ((ARMOperand*)Operands[0])->getStartLoc()));
-      MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
-      if (MatchResult2 == Match_Success)
-        MatchResult = Match_Success;
-      else {
-        ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
-        Operands.erase(Operands.begin() + 1);
-        delete CCOut;
-      }
-    }
-    // If we get a Match_MnemonicFail it might be some arithmetic instruction
-    // that updates the condition codes if it ends in 's'.  So see if the
-    // mnemonic ends in 's' and if so try removing the 's' and adding a CCOut
-    // operand with a value of CPSR.
-    else if (MatchResult == Match_MnemonicFail) {
-      // Get the instruction mnemonic, which is the first token.
-      StringRef Mnemonic = ((ARMOperand*)Operands[0])->getToken();
-      if (Mnemonic.substr(Mnemonic.size()-1) == "s") {
-        // removed the 's' from the mnemonic for matching.
-        StringRef MnemonicNoS = Mnemonic.slice(0, Mnemonic.size() - 1);
-        SMLoc NameLoc = ((ARMOperand*)Operands[0])->getStartLoc();
-        ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
-        Operands.erase(Operands.begin());
-        delete OldMnemonic;
-        Operands.insert(Operands.begin(),
-                        ARMOperand::CreateToken(MnemonicNoS, NameLoc));
-        Operands.insert(Operands.begin() + 1,
-                        ARMOperand::CreateCCOut(ARM::CPSR, NameLoc));
-        MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
-        if (MatchResult2 == Match_Success)
-          MatchResult = Match_Success;
-        else {
-          ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
-          Operands.erase(Operands.begin());
-          delete OldMnemonic;
-          Operands.insert(Operands.begin(),
-                          ARMOperand::CreateToken(Mnemonic, NameLoc));
-          ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
-          Operands.erase(Operands.begin() + 1);
-          delete CCOut;
-        }
-      }
-    }
-  }
   switch (MatchResult) {
+  default: break;
   case Match_Success:
+    // Context sensitive operand constraints aren't handled by the matcher,
+    // so check them here.
+    if (validateInstruction(Inst, Operands)) {
+      // Still progress the IT block, otherwise one wrong condition causes
+      // nasty cascading errors.
+      forwardITPosition();
+      return true;
+    }
+
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected.
+    processInstruction(Inst, Operands);
+
+    // Only move forward at the very end so that everything in validate
+    // and process gets a consistent answer about whether we're in an IT
+    // block.
+    forwardITPosition();
+
     Out.EmitInstruction(Inst);
     return false;
   case Match_MissingFeature:
@@ -2227,34 +4443,43 @@ MatchAndEmitInstruction(SMLoc IDLoc,
     return Error(ErrorLoc, "invalid operand for instruction");
   }
   case Match_MnemonicFail:
-    return Error(IDLoc, "unrecognized instruction mnemonic");
+    return Error(IDLoc, "invalid instruction");
   case Match_ConversionFail:
-    return Error(IDLoc, "unable to convert operands to instruction");
+    // The converter function will have already emited a diagnostic.
+    return true;
+  case Match_RequiresNotITBlock:
+    return Error(IDLoc, "flag setting instruction only valid outside IT block");
+  case Match_RequiresITBlock:
+    return Error(IDLoc, "instruction only valid inside IT block");
+  case Match_RequiresV6:
+    return Error(IDLoc, "instruction variant requires ARMv6 or later");
+  case Match_RequiresThumb2:
+    return Error(IDLoc, "instruction variant requires Thumb2");
   }
 
   llvm_unreachable("Implement any new match types added!");
   return true;
 }
 
-/// ParseDirective parses the arm specific directives
+/// parseDirective parses the arm specific directives
 bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
-    return ParseDirectiveWord(4, DirectiveID.getLoc());
+    return parseDirectiveWord(4, DirectiveID.getLoc());
   else if (IDVal == ".thumb")
-    return ParseDirectiveThumb(DirectiveID.getLoc());
+    return parseDirectiveThumb(DirectiveID.getLoc());
   else if (IDVal == ".thumb_func")
-    return ParseDirectiveThumbFunc(DirectiveID.getLoc());
+    return parseDirectiveThumbFunc(DirectiveID.getLoc());
   else if (IDVal == ".code")
-    return ParseDirectiveCode(DirectiveID.getLoc());
+    return parseDirectiveCode(DirectiveID.getLoc());
   else if (IDVal == ".syntax")
-    return ParseDirectiveSyntax(DirectiveID.getLoc());
+    return parseDirectiveSyntax(DirectiveID.getLoc());
   return true;
 }
 
-/// ParseDirectiveWord
+/// parseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
-bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -2277,9 +4502,9 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
-/// ParseDirectiveThumb
+/// parseDirectiveThumb
 ///  ::= .thumb
-bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
   Parser.Lex();
@@ -2290,9 +4515,9 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
   return false;
 }
 
-/// ParseDirectiveThumbFunc
+/// parseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
-bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
   const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
   bool isMachO = MAI.hasSubsectionsViaSymbols();
   StringRef Name;
@@ -2322,9 +4547,9 @@ bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
   return false;
 }
 
-/// ParseDirectiveSyntax
+/// parseDirectiveSyntax
 ///  ::= .syntax unified | divided
-bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
     return Error(L, "unexpected token in .syntax directive");
@@ -2345,9 +4570,9 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
   return false;
 }
 
-/// ParseDirectiveCode
+/// parseDirectiveCode
 ///  ::= .code 16 | 32
-bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Integer))
     return Error(L, "unexpected token in .code directive");
@@ -2380,8 +4605,8 @@ extern "C" void LLVMInitializeARMAsmLexer();
 
 /// Force static initialization.
 extern "C" void LLVMInitializeARMAsmParser() {
-  RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
-  RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget);
+  RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget);
+  RegisterMCAsmParser<ARMAsmParser> Y(TheThumbTarget);
   LLVMInitializeARMAsmLexer();
 }
 
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
index 9ba7c0125d7f5..3f5ad39debc31 100644
--- a/lib/Target/ARM/AsmParser/CMakeLists.txt
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -5,3 +5,12 @@ add_llvm_library(LLVMARMAsmParser
   ARMAsmParser.cpp
   )
 
+add_dependencies(LLVMARMAsmParser ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMAsmParser
+  LLVMARMDesc
+  LLVMARMInfo
+  LLVMMC
+  LLVMMCParser
+  LLVMSupport
+  )
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 21608d0b62fd7..f045e839a6167 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -1,21 +1,21 @@
 set(LLVM_TARGET_DEFINITIONS ARM.td)
 
-tablegen(ARMGenRegisterInfo.inc -gen-register-info)
-tablegen(ARMGenInstrInfo.inc -gen-instr-info)
-tablegen(ARMGenCodeEmitter.inc -gen-emitter)
-tablegen(ARMGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
-tablegen(ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
-tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(ARMGenFastISel.inc -gen-fast-isel)
-tablegen(ARMGenCallingConv.inc -gen-callingconv)
-tablegen(ARMGenSubtargetInfo.inc -gen-subtarget)
-tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
-tablegen(ARMGenDecoderTables.inc -gen-arm-decoder)
+llvm_tablegen(ARMGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(ARMGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(ARMGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(ARMGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+llvm_tablegen(ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
+llvm_tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(ARMGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(ARMGenFastISel.inc -gen-fast-isel)
+llvm_tablegen(ARMGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(ARMGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(ARMGenDisassemblerTables.inc -gen-disassembler)
+add_public_tablegen_target(ARMCommonTableGen)
 
 add_llvm_target(ARMCodeGen
-  ARMAsmBackend.cpp
   ARMAsmPrinter.cpp
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
@@ -32,9 +32,6 @@ add_llvm_target(ARMCodeGen
   ARMISelLowering.cpp
   ARMInstrInfo.cpp
   ARMJITInfo.cpp
-  ARMMachObjectWriter.cpp
-  ARMMCCodeEmitter.cpp
-  ARMMCExpr.cpp
   ARMLoadStoreOptimizer.cpp
   ARMMCInstLower.cpp
   ARMRegisterInfo.cpp
@@ -43,9 +40,8 @@ add_llvm_target(ARMCodeGen
   ARMTargetMachine.cpp
   ARMTargetObjectFile.cpp
   MLxExpansionPass.cpp
-  NEONMoveFix.cpp
-  Thumb1InstrInfo.cpp
   Thumb1FrameLowering.cpp
+  Thumb1InstrInfo.cpp
   Thumb1RegisterInfo.cpp
   Thumb2ITBlockPass.cpp
   Thumb2InstrInfo.cpp
@@ -53,6 +49,20 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
   )
 
+add_llvm_library_dependencies(LLVMARMCodeGen
+  LLVMARMAsmPrinter
+  LLVMARMDesc
+  LLVMARMInfo
+  LLVMAnalysis
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 # workaround for hanging compilation on MSVC10
 if( MSVC_VERSION EQUAL 1600 )
 set_property(
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index bdce2c4cf8960..8f2f813b676af 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -6,584 +6,4077 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code to implement the public interfaces of ARMDisassembler and
-// ThumbDisassembler, both of which are instances of MCDisassembler.
-//
-//===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "arm-disassembler"
 
-#include "ARMDisassembler.h"
-#include "ARMDisassemblerCore.h"
-
-#include "llvm/ADT/OwningPtr.h"
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
 #include "llvm/MC/EDInstInfo.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
-//#define DEBUG(X) do { X; } while (0)
-
-/// ARMGenDecoderTables.inc - ARMDecoderTables.inc is tblgen'ed from
-/// ARMDecoderEmitter.cpp TableGen backend.  It contains:
-///
-/// o Mappings from opcode to ARM/Thumb instruction format
-///
-/// o static uint16_t decodeInstruction(uint32_t insn) - the decoding function
-/// for an ARM instruction.
-///
-/// o static uint16_t decodeThumbInstruction(field_t insn) - the decoding
-/// function for a Thumb instruction.
-///
-#include "ARMGenDecoderTables.inc"
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+/// ARMDisassembler - ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ARMDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
+  }
+
+  ~ARMDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+  /// getEDInfo - See MCDisassembler.
+  EDInstInfo *getEDInfo() const;
+private:
+};
+
+/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ThumbDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
+  }
+
+  ~ThumbDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+  /// getEDInfo - See MCDisassembler.
+  EDInstInfo *getEDInfo() const;
+private:
+  mutable std::vector<unsigned> ITBlock;
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  return false;
+}
+
 
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(llvm::MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst & Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Adddress,
+                                                  const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(llvm::MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(llvm::MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(llvm::MCInst &Inst,unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+
+
+
+#include "ARMGenDisassemblerTables.inc"
+#include "ARMGenInstrInfo.inc"
 #include "ARMGenEDInfo.inc"
 
-using namespace llvm;
+static MCDisassembler *createARMDisassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new ARMDisassembler(STI);
+}
 
-/// showBitVector - Use the raw_ostream to log a diagnostic message describing
-/// the inidividual bits of the instruction.
-///
-static inline void showBitVector(raw_ostream &os, const uint32_t &insn) {
-  // Split the bit position markers into more than one lines to fit 80 columns.
-  os << " 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11"
-     << " 10  9  8  7  6  5  4  3  2  1  0 \n";
-  os << "---------------------------------------------------------------"
-     << "----------------------------------\n";
-  os << '|';
-  for (unsigned i = 32; i != 0; --i) {
-    if (insn >> (i - 1) & 0x01)
-      os << " 1";
-    else
-      os << " 0";
-    os << (i%4 == 1 ? '|' : ':');
-  }
-  os << '\n';
-  // Split the bit position markers into more than one lines to fit 80 columns.
-  os << "---------------------------------------------------------------"
-     << "----------------------------------\n";
-  os << '\n';
-}
-
-/// decodeARMInstruction is a decorator function which tries special cases of
-/// instruction matching before calling the auto-generated decoder function.
-static unsigned decodeARMInstruction(uint32_t &insn) {
-  if (slice(insn, 31, 28) == 15)
-    goto AutoGenedDecoder;
-
-  // Special case processing, if any, goes here....
-
-  // LLVM combines the offset mode of A8.6.197 & A8.6.198 into STRB.
-  // The insufficient encoding information of the combined instruction confuses
-  // the decoder wrt BFC/BFI.  Therefore, we try to recover here.
-  // For BFC, Inst{27-21} = 0b0111110 & Inst{6-0} = 0b0011111.
-  // For BFI, Inst{27-21} = 0b0111110 & Inst{6-4} = 0b001 & Inst{3-0} =! 0b1111.
-  if (slice(insn, 27, 21) == 0x3e && slice(insn, 6, 4) == 1) {
-    if (slice(insn, 3, 0) == 15)
-      return ARM::BFC;
-    else
-      return ARM::BFI;
-  }
-
-  // Ditto for STRBT, which is a super-instruction for A8.6.199 Encodings
-  // A1 & A2.
-  // As a result, the decoder fails to deocode USAT properly.
-  if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1)
-    return ARM::USAT;
-  // As a result, the decoder fails to deocode UQADD16 properly.
-  if (slice(insn, 27, 20) == 0x66 && slice(insn, 7, 4) == 1)
-    return ARM::UQADD16;
-
-  // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8.
-  // As a result, the decoder fails to decode UMULL properly.
-  if (slice(insn, 27, 21) == 0x04 && slice(insn, 7, 4) == 9) {
-    return ARM::UMULL;
-  }
-
-  // Ditto for STR_PRE, which is a super-instruction for A8.6.194 & A8.6.195.
-  // As a result, the decoder fails to decode SBFX properly.
-  if (slice(insn, 27, 21) == 0x3d && slice(insn, 6, 4) == 5)
-    return ARM::SBFX;
-
-  // And STRB_PRE, which is a super-instruction for A8.6.197 & A8.6.198.
-  // As a result, the decoder fails to decode UBFX properly.
-  if (slice(insn, 27, 21) == 0x3f && slice(insn, 6, 4) == 5)
-    return ARM::UBFX;
-
-  // Ditto for STRT, which is a super-instruction for A8.6.210 Encoding A1 & A2.
-  // As a result, the decoder fails to deocode SSAT properly.
-  if (slice(insn, 27, 21) == 0x35 && slice(insn, 5, 4) == 1)
-    return ARM::SSAT;
-
-  // Ditto for RSCrs, which is a super-instruction for A8.6.146 & A8.6.147.
-  // As a result, the decoder fails to decode STRHT/LDRHT/LDRSHT/LDRSBT.
-  if (slice(insn, 27, 24) == 0) {
-    switch (slice(insn, 21, 20)) {
-    case 2:
-      switch (slice(insn, 7, 4)) {
-      case 11:
-        return ARM::STRHT;
-      default:
-        break; // fallthrough
+static MCDisassembler *createThumbDisassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new ThumbDisassembler(STI);
+}
+
+EDInstInfo *ARMDisassembler::getEDInfo() const {
+  return instInfoARM;
+}
+
+EDInstInfo *ThumbDisassembler::getEDInfo() const {
+  return instInfoARM;
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             const MemoryObject &Region,
+                                             uint64_t Address,
+                                             raw_ostream &os,
+                                             raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  assert(!(STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t insn = (bytes[3] << 24) |
+                  (bytes[2] << 16) |
+                  (bytes[1] <<  8) |
+                  (bytes[0] <<  0);
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus result = decodeARMInstruction32(MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  // VFP and NEON instructions, similarly, are shared between ARM
+  // and Thumb modes.
+  MI.clear();
+  result = decodeVFPInstruction32(MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeNEONDataInstruction32(MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeNEONLoadStoreInstruction32(MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeNEONDupInstruction32(MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.  The immediate Value has had any PC
+/// adjustment made by the caller.  If the instruction is a branch instruction
+/// then isBranch is true, else false.  If the getOpInfo() function was set as
+/// part of the setupForSymbolicDisassembly() call then that function is called
+/// to get any symbolic information at the Address for this instruction.  If
+/// that returns non-zero then the symbolic information it returns is used to
+/// create an MCExpr and that is added as an operand to the MCInst.  If
+/// getOpInfo() returns zero and isBranch is true then a symbol look up for
+/// Value is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with Value is created.  This function returns true if it adds an
+/// operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+                                     bool isBranch, uint64_t InstSize,
+                                     MCInst &MI, const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+  if (!getOpInfo)
+    return false;
+
+  struct LLVMOpInfo1 SymbolicOp;
+  SymbolicOp.Value = Value;
+  void *DisInfo = Dis->getDisInfoBlock();
+  if (!getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+    if (isBranch) {
+      LLVMSymbolLookupCallback SymbolLookUp =
+                                            Dis->getLLVMSymbolLookupCallback();
+      if (SymbolLookUp) {
+        uint64_t ReferenceType;
+        ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+        const char *ReferenceName;
+        const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+                                        &ReferenceName);
+        if (Name) {
+          SymbolicOp.AddSymbol.Name = Name;
+          SymbolicOp.AddSymbol.Present = true;
+          SymbolicOp.Value = 0;
+        }
+        else {
+          SymbolicOp.Value = Value;
+        }
+        if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+          (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
       }
-      break;
-    case 3:
-      switch (slice(insn, 7, 4)) {
-      case 11:
-        return ARM::LDRHT;
-      case 13:
-        return ARM::LDRSBT;
-      case 15:
-        return ARM::LDRSHT;
-      default:
-        break; // fallthrough
+      else {
+        return false;
       }
-      break;
-    default:
-      break;   // fallthrough
+    }
+    else {
+      return false;
     }
   }
 
-  // Ditto for SBCrs, which is a super-instruction for A8.6.152 & A8.6.153.
-  // As a result, the decoder fails to decode STRH_Post/LDRD_POST/STRD_POST
-  // properly.
-  if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 0) {
-    unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21);
-    switch (slice(insn, 7, 4)) {
-    case 11:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::STRH;
-      case 3: // Pre-indexed
-        return ARM::STRH_PRE;
-      case 0: // Post-indexed
-        return ARM::STRH_POST;
-      default:
-        break; // fallthrough
-      }
-      break;
-    case 13:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::LDRD;
-      case 3: // Pre-indexed
-        return ARM::LDRD_PRE;
-      case 0: // Post-indexed
-        return ARM::LDRD_POST;
-      default:
-        break; // fallthrough
-      }
-      break;
-    case 15:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::STRD;
-      case 3: // Pre-indexed
-        return ARM::STRD_PRE;
-      case 0: // Post-indexed
-        return ARM::STRD_POST;
-      default:
-        break; // fallthrough
-      }
-      break;
-    default:
-      break; // fallthrough
+  MCContext *Ctx = Dis->getMCContext();
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
     }
   }
 
-  // Ditto for SBCSSrs, which is a super-instruction for A8.6.152 & A8.6.153.
-  // As a result, the decoder fails to decode LDRH_POST/LDRSB_POST/LDRSH_POST
-  // properly.
-  if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 1) {
-    unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21);
-    switch (slice(insn, 7, 4)) {
-    case 11:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::LDRH;
-      case 3: // Pre-indexed
-        return ARM::LDRH_PRE;
-      case 0: // Post-indexed
-        return ARM::LDRH_POST;
-      default:
-        break; // fallthrough
-      }
-      break;
-    case 13:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::LDRSB;
-      case 3: // Pre-indexed
-        return ARM::LDRSB_PRE;
-      case 0: // Post-indexed
-        return ARM::LDRSB_POST;
-      default:
-        break; // fallthrough
-      }
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, *Ctx);
+  }
+
+  if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+    MI.addOperand(MCOperand::CreateExpr(Expr));
+  else 
+    assert(0 && "bad SymbolicOp.VariantKind");
+
+  return true;
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the Pc.
+/// These can often be values in a literal pool near the Address of the
+/// instruction.  The Address of the instruction and its immediate Value are
+/// used as a possible literal pool entry.  The SymbolLookUp call back will
+/// return the name of a symbol referenced by the the literal pool's entry if
+/// the referenced address is that of a symbol.  Or it will return a pointer to
+/// a literal 'C' string if the referenced address of the literal pool's entry
+/// is an address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+					    const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+  if (SymbolLookUp) {
+    void *DisInfo = Dis->getDisInfoBlock();
+    uint64_t ReferenceType;
+    ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName;
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
+       ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+  }
+}
+
+// Thumb1 instructions don't have explicit S bits.  Rather, they
+// implicitly set CPSR.  Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand.  We need to fix
+// that as a post-pass.
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) {
+      if (i > 0 && OpInfo[i-1].isPredicate()) continue;
+      MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+      return;
+    }
+  }
+
+  MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context.  We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  // A few instructions actually have predicates encoded in them.  Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+    case ARM::tBcc:
+    case ARM::t2Bcc:
+    case ARM::tCBZ:
+    case ARM::tCBNZ:
+    case ARM::tCPS:
+    case ARM::t2CPS3p:
+    case ARM::t2CPS2p:
+    case ARM::t2CPS1p:
+    case ARM::tMOVSr:
+    case ARM::tSETEND:
+      // Some instructions (mostly conditional branches) are not
+      // allowed in IT blocks.
+      if (!ITBlock.empty())
+        S = SoftFail;
+      else
+        return Success;
       break;
-    case 15:
-      switch (PW) {
-      case 2: // Offset
-        return ARM::LDRSH;
-      case 3: // Pre-indexed
-        return ARM::LDRSH_PRE;
-      case 0: // Post-indexed
-        return ARM::LDRSH_POST;
-      default:
-        break; // fallthrough
-      }
+    case ARM::tB:
+    case ARM::t2B:
+    case ARM::t2TBB:
+    case ARM::t2TBH:
+      // Some instructions (mostly unconditional branches) can
+      // only appears at the end of, or outside of, an IT.
+      if (ITBlock.size() > 1)
+        S = SoftFail;
       break;
     default:
-      break; // fallthrough
+      break;
+  }
+
+  // If we're in an IT block, base the predicate on that.  Otherwise,
+  // assume a predicate of AL.
+  unsigned CC;
+  if (!ITBlock.empty()) {
+    CC = ITBlock.back();
+    if (CC == 0xF)
+      CC = ARMCC::AL;
+    ITBlock.pop_back();
+  } else
+    CC = ARMCC::AL;
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isPredicate()) {
+      I = MI.insert(I, MCOperand::CreateImm(CC));
+      ++I;
+      if (CC == ARMCC::AL)
+        MI.insert(I, MCOperand::CreateReg(0));
+      else
+        MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+      return S;
     }
   }
 
-AutoGenedDecoder:
-  // Calling the auto-generated decoder function.
-  return decodeInstruction(insn);
+  I = MI.insert(I, MCOperand::CreateImm(CC));
+  ++I;
+  if (CC == ARMCC::AL)
+    MI.insert(I, MCOperand::CreateReg(0));
+  else
+    MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+
+  return S;
 }
 
-// Helper function for special case handling of LDR (literal) and friends.
-// See, for example, A6.3.7 Load word: Table A6-18 Load word.
-// See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode
-// before returning it.
-static unsigned T2Morph2LoadLiteral(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return Opcode; // Return unmorphed opcode.
+// Thumb VFP instructions are a special case.  Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand.  We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+  unsigned CC;
+  if (!ITBlock.empty()) {
+    CC = ITBlock.back();
+    ITBlock.pop_back();
+  } else
+    CC = ARMCC::AL;
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  MCInst::iterator I = MI.begin();
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (OpInfo[i].isPredicate() ) {
+      I->setImm(CC);
+      ++I;
+      if (CC == ARMCC::AL)
+        I->setReg(0);
+      else
+        I->setReg(ARM::CPSR);
+      return;
+    }
+  }
+}
+
+DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               const MemoryObject &Region,
+                                               uint64_t Address,
+                                               raw_ostream &os,
+                                               raw_ostream &cs) const {
+  CommentStream = &cs;
 
-  case ARM::t2LDR_POST:   case ARM::t2LDR_PRE:
-  case ARM::t2LDRi12:     case ARM::t2LDRi8:
-  case ARM::t2LDRs:       case ARM::t2LDRT:
-    return ARM::t2LDRpci;
+  uint8_t bytes[4];
 
-  case ARM::t2LDRB_POST:  case ARM::t2LDRB_PRE:
-  case ARM::t2LDRBi12:    case ARM::t2LDRBi8:
-  case ARM::t2LDRBs:      case ARM::t2LDRBT:
-    return ARM::t2LDRBpci;
+  assert((STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
 
-  case ARM::t2LDRH_POST:  case ARM::t2LDRH_PRE:
-  case ARM::t2LDRHi12:    case ARM::t2LDRHi8:
-  case ARM::t2LDRHs:      case ARM::t2LDRHT:
-    return ARM::t2LDRHpci;
+  // We want to read exactly 2 bytes of data.
+  if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
 
-  case ARM::t2LDRSB_POST:  case ARM::t2LDRSB_PRE:
-  case ARM::t2LDRSBi12:    case ARM::t2LDRSBi8:
-  case ARM::t2LDRSBs:      case ARM::t2LDRSBT:
-    return ARM::t2LDRSBpci;
+  uint16_t insn16 = (bytes[1] << 8) | bytes[0];
+  DecodeStatus result = decodeThumbInstruction16(MI, insn16, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
 
-  case ARM::t2LDRSH_POST:  case ARM::t2LDRSH_PRE:
-  case ARM::t2LDRSHi12:    case ARM::t2LDRSHi8:
-  case ARM::t2LDRSHs:      case ARM::t2LDRSHT:
-    return ARM::t2LDRSHpci;
+  MI.clear();
+  result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI);
+  if (result) {
+    Size = 2;
+    bool InITBlock = !ITBlock.empty();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
   }
-}
 
-// Helper function for special case handling of PLD (literal) and friends.
-// See A8.6.117 T1 & T2 and friends for why we morphed the opcode
-// before returning it.
-static unsigned T2Morph2PLDLiteral(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return Opcode; // Return unmorphed opcode.
-
-  case ARM::t2PLDi8:   case ARM::t2PLDs:
-  case ARM::t2PLDWi12: case ARM::t2PLDWi8:
-  case ARM::t2PLDWs:
-    return ARM::t2PLDi12;
-
-  case ARM::t2PLIi8:   case ARM::t2PLIs:
-    return ARM::t2PLIi12;
-  }
-}
-
-/// decodeThumbSideEffect is a decorator function which can potentially twiddle
-/// the instruction or morph the returned opcode under Thumb2.
-///
-/// First it checks whether the insn is a NEON or VFP instr; if true, bit
-/// twiddling could be performed on insn to turn it into an ARM NEON/VFP
-/// equivalent instruction and decodeInstruction is called with the transformed
-/// insn.
-///
-/// Next, there is special handling for Load byte/halfword/word instruction by
-/// checking whether Rn=0b1111 and call T2Morph2LoadLiteral() on the decoded
-/// Thumb2 instruction.  See comments below for further details.
-///
-/// Finally, one last check is made to see whether the insn is a NEON/VFP and
-/// decodeInstruction(insn) is invoked on the original insn.
-///
-/// Otherwise, decodeThumbInstruction is called with the original insn.
-static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) {
-  if (IsThumb2) {
-    uint16_t op1 = slice(insn, 28, 27);
-    uint16_t op2 = slice(insn, 26, 20);
-
-    // A6.3 32-bit Thumb instruction encoding
-    // Table A6-9 32-bit Thumb instruction encoding
-
-    // The coprocessor instructions of interest are transformed to their ARM
-    // equivalents.
-
-    // --------- Transform Begin Marker ---------
-    if ((op1 == 1 || op1 == 3) && slice(op2, 6, 4) == 7) {
-      // A7.4 Advanced SIMD data-processing instructions
-      // U bit of Thumb corresponds to Inst{24} of ARM.
-      uint16_t U = slice(op1, 1, 1);
-
-      // Inst{28-24} of ARM = {1,0,0,1,U};
-      uint16_t bits28_24 = 9 << 1 | U;
-      DEBUG(showBitVector(errs(), insn));
-      setSlice(insn, 28, 24, bits28_24);
-      return decodeInstruction(insn);
-    }
+  MI.clear();
+  result = decodeThumb2Instruction16(MI, insn16, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+
+    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
+    // the Thumb predicate.
+    if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty())
+      result = MCDisassembler::SoftFail;
+
+    Check(result, AddThumbPredicate(MI));
+
+    // If we find an IT instruction, we need to parse its condition
+    // code and mask operands so that we can apply them correctly
+    // to the subsequent instructions.
+    if (MI.getOpcode() == ARM::t2IT) {
+
+      // (3 - the number of trailing zeros) is the number of then / else.
+      unsigned firstcond = MI.getOperand(0).getImm();
+      unsigned Mask = MI.getOperand(1).getImm();
+      unsigned CondBit0 = Mask >> 4 & 1;
+      unsigned NumTZ = CountTrailingZeros_32(Mask);
+      assert(NumTZ <= 3 && "Invalid IT mask!");
+      for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+        bool T = ((Mask >> Pos) & 1) == CondBit0;
+        if (T)
+          ITBlock.insert(ITBlock.begin(), firstcond);
+        else
+          ITBlock.insert(ITBlock.begin(), firstcond ^ 1);
+      }
 
-    if (op1 == 3 && slice(op2, 6, 4) == 1 && slice(op2, 0, 0) == 0) {
-      // A7.7 Advanced SIMD element or structure load/store instructions
-      // Inst{27-24} of Thumb = 0b1001
-      // Inst{27-24} of ARM   = 0b0100
-      DEBUG(showBitVector(errs(), insn));
-      setSlice(insn, 27, 24, 4);
-      return decodeInstruction(insn);
+      ITBlock.push_back(firstcond);
     }
-    // --------- Transform End Marker ---------
-
-    unsigned unmorphed = decodeThumbInstruction(insn);
-
-    // See, for example, A6.3.7 Load word: Table A6-18 Load word.
-    // See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode
-    // before returning it to our caller.
-    if (op1 == 3 && slice(op2, 6, 5) == 0 && slice(op2, 0, 0) == 1
-        && slice(insn, 19, 16) == 15) {
-      unsigned morphed = T2Morph2LoadLiteral(unmorphed);
-      if (morphed != unmorphed)
-        return morphed;
+
+    return result;
+  }
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint32_t insn32 = (bytes[3] <<  8) |
+                    (bytes[2] <<  0) |
+                    (bytes[1] << 24) |
+                    (bytes[0] << 16);
+  MI.clear();
+  result = decodeThumbInstruction32(MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    bool InITBlock = ITBlock.size();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeThumb2Instruction32(MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  MI.clear();
+  result = decodeVFPInstruction32(MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    UpdateThumbVFPPredicate(MI);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeNEONDupInstruction32(MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  if (fieldFromInstruction32(insn32, 24, 8) == 0xF9) {
+    MI.clear();
+    uint32_t NEONLdStInsn = insn32;
+    NEONLdStInsn &= 0xF0FFFFFF;
+    NEONLdStInsn |= 0x04000000;
+    result = decodeNEONLoadStoreInstruction32(MI, NEONLdStInsn, Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
     }
+  }
 
-    // See, for example, A8.6.117 PLD,PLDW (immediate) T1 & T2, and friends for
-    // why we morphed the opcode before returning it to our caller.
-    if (slice(insn, 31, 25) == 0x7C && slice(insn, 15, 12) == 0xF
-        && slice(insn, 22, 22) == 0 && slice(insn, 20, 20) == 1
-        && slice(insn, 19, 16) == 15) {
-      unsigned morphed = T2Morph2PLDLiteral(unmorphed);
-      if (morphed != unmorphed)
-        return morphed;
+  if (fieldFromInstruction32(insn32, 24, 4) == 0xF) {
+    MI.clear();
+    uint32_t NEONDataInsn = insn32;
+    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeNEONDataInstruction32(MI, NEONDataInsn, Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
     }
+  }
 
-    // One last check for NEON/VFP instructions.
-    if ((op1 == 1 || op1 == 3) && slice(op2, 6, 6) == 1)
-      return decodeInstruction(insn);
+  Size = 0;
+  return MCDisassembler::Fail;
+}
 
-    // Fall through.
-  }
 
-  return decodeThumbInstruction(insn);
+extern "C" void LLVMInitializeARMDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheARMTarget,
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
+                                         createThumbDisassembler);
 }
 
-//
-// Public interface for the disassembler
-//
+static const unsigned GPRDecoderTable[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+  ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+  ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+  ARM::R12, ARM::SP, ARM::LR, ARM::PC
+};
+
+static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Register = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
 
-bool ARMDisassembler::getInstruction(MCInst &MI,
-                                     uint64_t &Size,
-                                     const MemoryObject &Region,
-                                     uint64_t Address,
-                                     raw_ostream &os) const {
-  // The machine instruction.
-  uint32_t insn;
-  uint8_t bytes[4];
+static DecodeStatus
+DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                           uint64_t Address, const void *Decoder) {
+  if (RegNo == 15) return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
 
-  // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1)
-    return false;
+static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
 
-  // Encoded as a small-endian 32-bit word in the stream.
-  insn = (bytes[3] << 24) |
-         (bytes[2] << 16) |
-         (bytes[1] <<  8) |
-         (bytes[0] <<  0);
-
-  unsigned Opcode = decodeARMInstruction(insn);
-  ARMFormat Format = ARMFormats[Opcode];
-  Size = 4;
-
-  DEBUG({
-      errs() << "\nOpcode=" << Opcode << " Name=" <<ARMUtils::OpcodeName(Opcode)
-             << " Format=" << stringForARMFormat(Format) << '(' << (int)Format
-             << ")\n";
-      showBitVector(errs(), insn);
-    });
-
-  OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format));
-  if (!Builder)
-    return false;
+static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+    case 0:
+      Register = ARM::R0;
+      break;
+    case 1:
+      Register = ARM::R1;
+      break;
+    case 2:
+      Register = ARM::R2;
+      break;
+    case 3:
+      Register = ARM::R3;
+      break;
+    case 9:
+      Register = ARM::R9;
+      break;
+    case 12:
+      Register = ARM::R12;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
 
-  Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
-                                              getDisInfoBlock(), getMCContext(),
-                                              Address);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
 
-  if (!Builder->Build(MI, insn))
-    return false;
+static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
 
-  return true;
+static const unsigned SPRDecoderTable[] = {
+     ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+     ARM::S8,  ARM::S9, ARM::S10, ARM::S11,
+    ARM::S12, ARM::S13, ARM::S14, ARM::S15,
+    ARM::S16, ARM::S17, ARM::S18, ARM::S19,
+    ARM::S20, ARM::S21, ARM::S22, ARM::S23,
+    ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+    ARM::S28, ARM::S29, ARM::S30, ARM::S31
+};
+
+static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = SPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
 }
 
-bool ThumbDisassembler::getInstruction(MCInst &MI,
-                                       uint64_t &Size,
-                                       const MemoryObject &Region,
-                                       uint64_t Address,
-                                       raw_ostream &os) const {
-  // The Thumb instruction stream is a sequence of halfwords.
-
-  // This represents the first halfword as well as the machine instruction
-  // passed to decodeThumbInstruction().  For 16-bit Thumb instruction, the top
-  // halfword of insn is 0x00 0x00; otherwise, the first halfword is moved to
-  // the top half followed by the second halfword.
-  unsigned insn = 0;
-  // Possible second halfword.
-  uint16_t insn1 = 0;
-
-  // A6.1 Thumb instruction set encoding
-  //
-  // If bits [15:11] of the halfword being decoded take any of the following
-  // values, the halfword is the first halfword of a 32-bit instruction:
-  // o 0b11101
-  // o 0b11110
-  // o 0b11111.
-  //
-  // Otherwise, the halfword is a 16-bit instruction.
-
-  // Read 2 bytes of data first.
-  uint8_t bytes[2];
-  if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1)
-    return false;
+static const unsigned DPRDecoderTable[] = {
+     ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+     ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+     ARM::D8,  ARM::D9, ARM::D10, ARM::D11,
+    ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+    ARM::D28, ARM::D29, ARM::D30, ARM::D31
+};
+
+static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
 
-  // Encoded as a small-endian 16-bit halfword in the stream.
-  insn = (bytes[1] << 8) | bytes[0];
-  unsigned bits15_11 = slice(insn, 15, 11);
-  bool IsThumb2 = false;
+static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
 
-  // 32-bit instructions if the bits [15:11] of the halfword matches
-  // { 0b11101 /* 0x1D */, 0b11110 /* 0x1E */, ob11111 /* 0x1F */ }.
-  if (bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F) {
-    IsThumb2 = true;
-    if (Region.readBytes(Address + 2, 2, (uint8_t*)bytes, NULL) == -1)
-      return false;
-    // Encoded as a small-endian 16-bit halfword in the stream.
-    insn1 = (bytes[1] << 8) | bytes[0];
-    insn = (insn << 16 | insn1);
-  }
-
-  // The insn could potentially be bit-twiddled in order to be decoded as an ARM
-  // NEON/VFP opcode.  In such case, the modified insn is later disassembled as
-  // an ARM NEON/VFP instruction.
-  //
-  // This is a short term solution for lack of encoding bits specified for the
-  // Thumb2 NEON/VFP instructions.  The long term solution could be adding some
-  // infrastructure to have each instruction support more than one encodings.
-  // Which encoding is used would be based on which subtarget the compiler/
-  // disassembler is working with at the time.  This would allow the sharing of
-  // the NEON patterns between ARM and Thumb2, as well as potential greater
-  // sharing between the regular ARM instructions and the 32-bit wide Thumb2
-  // instructions as well.
-  unsigned Opcode = decodeThumbSideEffect(IsThumb2, insn);
-
-  ARMFormat Format = ARMFormats[Opcode];
-  Size = IsThumb2 ? 4 : 2;
-
-  DEBUG({
-      errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode)
-             << " Format=" << stringForARMFormat(Format) << '(' << (int)Format
-             << ")\n";
-      showBitVector(errs(), insn);
-    });
-
-  OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format));
-  if (!Builder)
-    return false;
+static DecodeStatus
+DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
 
-  Builder->SetSession(const_cast<Session *>(&SO));
+static const unsigned QPRDecoderTable[] = {
+     ARM::Q0,  ARM::Q1,  ARM::Q2,  ARM::Q3,
+     ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7,
+     ARM::Q8,  ARM::Q9, ARM::Q10, ARM::Q11,
+    ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+};
 
-  Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
-                                              getDisInfoBlock(), getMCContext(),
-                                              Address);
 
-  if (!Builder->Build(MI, insn))
-    return false;
+static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  RegNo >>= 1;
 
-  return true;
+  unsigned Register = QPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
 }
 
-// A8.6.50
-// Valid return values are {1, 2, 3, 4}, with 0 signifying an error condition.
-static unsigned short CountITSize(unsigned ITMask) {
-  // First count the trailing zeros of the IT mask.
-  unsigned TZ = CountTrailingZeros_32(ITMask);
-  if (TZ > 3) {
-    DEBUG(errs() << "Encoding error: IT Mask '0000'");
-    return 0;
-  }
-  return (4 - TZ);
+static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val == 0xF) return MCDisassembler::Fail;
+  // AL predicate is not allowed on Thumb1 branches.
+  if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  if (Val == ARMCC::AL) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+  } else
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  return MCDisassembler::Success;
 }
 
-/// Init ITState.  Note that at least one bit is always 1 in mask.
-bool Session::InitIT(unsigned short bits7_0) {
-  ITCounter = CountITSize(slice(bits7_0, 3, 0));
-  if (ITCounter == 0)
-    return false;
+static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val)
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  else
+    Inst.addOperand(MCOperand::CreateReg(0));
+  return MCDisassembler::Success;
+}
 
-  // A8.6.50 IT
-  unsigned short FirstCond = slice(bits7_0, 7, 4);
-  if (FirstCond == 0xF) {
-    DEBUG(errs() << "Encoding error: IT FirstCond '1111'");
-    return false;
-  }
-  if (FirstCond == 0xE && ITCounter != 1) {
-    DEBUG(errs() << "Encoding error: IT FirstCond '1110' && Mask != '1000'");
-    return false;
+static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  uint32_t imm = Val & 0xFF;
+  uint32_t rot = (Val & 0xF00) >> 7;
+  uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F));
+  Inst.addOperand(MCOperand::CreateImm(rot_imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+  unsigned type = fieldFromInstruction32(Val, 5, 2);
+  unsigned imm = fieldFromInstruction32(Val, 7, 5);
+
+  // Register-immediate
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
   }
 
-  ITState = bits7_0;
+  if (Shift == ARM_AM::ror && imm == 0)
+    Shift = ARM_AM::rrx;
 
-  return true;
+  unsigned Op = Shift | (imm << 3);
+  Inst.addOperand(MCOperand::CreateImm(Op));
+
+  return S;
 }
 
-/// Update ITState if necessary.
-void Session::UpdateIT() {
-  assert(ITCounter);
-  --ITCounter;
-  if (ITCounter == 0)
-    ITState = 0;
-  else {
-    unsigned short NewITState4_0 = slice(ITState, 4, 0) << 1;
-    setSlice(ITState, 4, 0, NewITState4_0);
+static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+  unsigned type = fieldFromInstruction32(Val, 5, 2);
+  unsigned Rs = fieldFromInstruction32(Val, 8, 4);
+
+  // Register-register
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
   }
+
+  Inst.addOperand(MCOperand::CreateImm(Shift));
+
+  return S;
 }
 
-static MCDisassembler *createARMDisassembler(const Target &T) {
-  return new ARMDisassembler;
+static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  bool writebackLoad = false;
+  unsigned writebackReg = 0;
+  switch (Inst.getOpcode()) {
+    default:
+      break;
+    case ARM::LDMIA_UPD:
+    case ARM::LDMDB_UPD:
+    case ARM::LDMIB_UPD:
+    case ARM::LDMDA_UPD:
+    case ARM::t2LDMIA_UPD:
+    case ARM::t2LDMDB_UPD:
+      writebackLoad = true;
+      writebackReg = Inst.getOperand(0).getReg();
+      break;
+  }
+
+  // Empty register lists are not allowed.
+  if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail;
+  for (unsigned i = 0; i < 16; ++i) {
+    if (Val & (1 << i)) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+        return MCDisassembler::Fail;
+      // Writeback not allowed if Rn is in the target list.
+      if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+        Check(S, MCDisassembler::SoftFail);
+    }
+  }
+
+  return S;
 }
 
-static MCDisassembler *createThumbDisassembler(const Target &T) {
-  return new ThumbDisassembler;
+static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction32(Val, 8, 4);
+  unsigned regs = Val & 0xFF;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
 }
 
-extern "C" void LLVMInitializeARMDisassembler() {
-  // Register the disassembler.
-  TargetRegistry::RegisterMCDisassembler(TheARMTarget,
-                                         createARMDisassembler);
-  TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
-                                         createThumbDisassembler);
+static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction32(Val, 8, 4);
+  unsigned regs = (Val & 0xFF) / 2;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
 }
 
-EDInstInfo *ARMDisassembler::getEDInfo() const {
-  return instInfoARM;
+static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  // This operand encodes a mask of contiguous zeros between a specified MSB
+  // and LSB.  To decode it, we create the mask of all bits MSB-and-lower,
+  // the mask of all bits LSB-and-lower, and then xor them to create
+  // the mask of that's all ones on [msb, lsb].  Finally we not it to
+  // create the final mask.
+  unsigned msb = fieldFromInstruction32(Val, 5, 5);
+  unsigned lsb = fieldFromInstruction32(Val, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+  if (lsb > msb) Check(S, MCDisassembler::SoftFail);
+
+  uint32_t msb_mask = 0xFFFFFFFF;
+  if (msb != 31) msb_mask = (1U << (msb+1)) - 1;
+  uint32_t lsb_mask = (1U << lsb) - 1;
+
+  Inst.addOperand(MCOperand::CreateImm(~(msb_mask ^ lsb_mask)));
+  return S;
 }
 
-EDInstInfo *ThumbDisassembler::getEDInfo() const {
-  return instInfoARM;
+static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned CRd = fieldFromInstruction32(Insn, 12, 4);
+  unsigned coproc = fieldFromInstruction32(Insn, 8, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 8);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned U = fieldFromInstruction32(Insn, 23, 1);
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDC_OPTION:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2LDCL_POST:
+    case ARM::t2LDCL_OPTION:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STC_POST:
+    case ARM::t2STC_OPTION:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STCL_PRE:
+    case ARM::t2STCL_POST:
+    case ARM::t2STCL_OPTION:
+      if (coproc == 0xA || coproc == 0xB)
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(coproc));
+  Inst.addOperand(MCOperand::CreateImm(CRd));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  unsigned P = fieldFromInstruction32(Insn, 24, 1);
+  unsigned W = fieldFromInstruction32(Insn, 21, 1);
+
+  bool writeback = (P == 0) || (W == 1);
+  unsigned idx_mode = 0;
+  if (P && writeback)
+    idx_mode = ARMII::IndexModePre;
+  else if (!P && writeback)
+    idx_mode = ARMII::IndexModePost;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDC2_OFFSET:
+    case ARM::t2LDC2L_OFFSET:
+    case ARM::t2LDC2_PRE:
+    case ARM::t2LDC2L_PRE:
+    case ARM::t2STC2_OFFSET:
+    case ARM::t2STC2L_OFFSET:
+    case ARM::t2STC2_PRE:
+    case ARM::t2STC2L_PRE:
+    case ARM::LDC2_OFFSET:
+    case ARM::LDC2L_OFFSET:
+    case ARM::LDC2_PRE:
+    case ARM::LDC2L_PRE:
+    case ARM::STC2_OFFSET:
+    case ARM::STC2L_OFFSET:
+    case ARM::STC2_PRE:
+    case ARM::STC2L_PRE:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STCL_PRE:
+    case ARM::LDC_OFFSET:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDCL_PRE:
+    case ARM::STC_OFFSET:
+    case ARM::STCL_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STCL_PRE:
+      imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+    case ARM::t2LDC2_POST:
+    case ARM::t2LDC2L_POST:
+    case ARM::t2STC2_POST:
+    case ARM::t2STC2L_POST:
+    case ARM::LDC2_POST:
+    case ARM::LDC2L_POST:
+    case ARM::STC2_POST:
+    case ARM::STC2L_POST:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDCL_POST:
+    case ARM::t2STC_POST:
+    case ARM::t2STCL_POST:
+    case ARM::LDC_POST:
+    case ARM::LDCL_POST:
+    case ARM::STC_POST:
+    case ARM::STCL_POST:
+      imm |= U << 8;
+      // fall through.
+    default:
+      // The 'option' variant doesn't encode 'U' in the immediate since
+      // the immediate is unsigned [0,255].
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+      if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
 }
+
+static DecodeStatus
+DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned reg = fieldFromInstruction32(Insn, 25, 1);
+  unsigned P = fieldFromInstruction32(Insn, 24, 1);
+  unsigned W = fieldFromInstruction32(Insn, 21, 1);
+
+  // On stores, the writeback operand precedes Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::STR_POST_IMM:
+    case ARM::STR_POST_REG:
+    case ARM::STRB_POST_IMM:
+    case ARM::STRB_POST_REG:
+    case ARM::STRT_POST_REG:
+    case ARM::STRT_POST_IMM:
+    case ARM::STRBT_POST_REG:
+    case ARM::STRBT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // On loads, the writeback operand comes after Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::LDR_POST_IMM:
+    case ARM::LDR_POST_REG:
+    case ARM::LDRB_POST_IMM:
+    case ARM::LDRB_POST_REG:
+    case ARM::LDRBT_POST_REG:
+    case ARM::LDRBT_POST_IMM:
+    case ARM::LDRT_POST_REG:
+    case ARM::LDRT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::AddrOpc Op = ARM_AM::add;
+  if (!fieldFromInstruction32(Insn, 23, 1))
+    Op = ARM_AM::sub;
+
+  bool writeback = (P == 0) || (W == 1);
+  unsigned idx_mode = 0;
+  if (P && writeback)
+    idx_mode = ARMII::IndexModePre;
+  else if (!P && writeback)
+    idx_mode = ARMII::IndexModePost;
+
+  if (writeback && (Rn == 15 || Rn == Rt))
+    S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+  if (reg) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+    switch( fieldFromInstruction32(Insn, 5, 2)) {
+      case 0:
+        Opc = ARM_AM::lsl;
+        break;
+      case 1:
+        Opc = ARM_AM::lsr;
+        break;
+      case 2:
+        Opc = ARM_AM::asr;
+        break;
+      case 3:
+        Opc = ARM_AM::ror;
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+    unsigned amt = fieldFromInstruction32(Insn, 7, 5);
+    unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  } else {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+    Inst.addOperand(MCOperand::CreateImm(tmp));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+  unsigned Rm = fieldFromInstruction32(Val,  0, 4);
+  unsigned type = fieldFromInstruction32(Val, 5, 2);
+  unsigned imm = fieldFromInstruction32(Val, 7, 5);
+  unsigned U = fieldFromInstruction32(Val, 12, 1);
+
+  ARM_AM::ShiftOpc ShOp = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      ShOp = ARM_AM::lsl;
+      break;
+    case 1:
+      ShOp = ARM_AM::lsr;
+      break;
+    case 2:
+      ShOp = ARM_AM::asr;
+      break;
+    case 3:
+      ShOp = ARM_AM::ror;
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  unsigned shift;
+  if (U)
+    shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp);
+  else
+    shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp);
+  Inst.addOperand(MCOperand::CreateImm(shift));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned type = fieldFromInstruction32(Insn, 22, 1);
+  unsigned imm = fieldFromInstruction32(Insn, 8, 4);
+  unsigned U = ((~fieldFromInstruction32(Insn, 23, 1)) & 1) << 8;
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned W = fieldFromInstruction32(Insn, 21, 1);
+  unsigned P = fieldFromInstruction32(Insn, 24, 1);
+
+  bool writeback = (W == 1) | (P == 0);
+
+  // For {LD,ST}RD, Rt must be even, else undefined.
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (Rt & 0x1) return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) { // Writeback
+    if (P)
+      U |= ARMII::IndexModePre << 9;
+    else
+      U |= ARMII::IndexModePost << 9;
+
+    // On stores, the writeback operand precedes Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) {
+    // On loads, the writeback operand comes after Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+    case ARM::LDRHTr:
+    case ARM::LDRSBTr:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (type) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(U | (imm << 4) | Rm));
+  } else {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateImm(U));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeRFEInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned mode = fieldFromInstruction32(Insn, 23, 2);
+
+  switch (mode) {
+    case 0:
+      mode = ARM_AM::da;
+      break;
+    case 1:
+      mode = ARM_AM::ia;
+      break;
+    case 2:
+      mode = ARM_AM::db;
+      break;
+    case 3:
+      mode = ARM_AM::ib;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(mode));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned reglist = fieldFromInstruction32(Insn, 0, 16);
+
+  if (pred == 0xF) {
+    switch (Inst.getOpcode()) {
+      case ARM::LDMDA:
+        Inst.setOpcode(ARM::RFEDA);
+        break;
+      case ARM::LDMDA_UPD:
+        Inst.setOpcode(ARM::RFEDA_UPD);
+        break;
+      case ARM::LDMDB:
+        Inst.setOpcode(ARM::RFEDB);
+        break;
+      case ARM::LDMDB_UPD:
+        Inst.setOpcode(ARM::RFEDB_UPD);
+        break;
+      case ARM::LDMIA:
+        Inst.setOpcode(ARM::RFEIA);
+        break;
+      case ARM::LDMIA_UPD:
+        Inst.setOpcode(ARM::RFEIA_UPD);
+        break;
+      case ARM::LDMIB:
+        Inst.setOpcode(ARM::RFEIB);
+        break;
+      case ARM::LDMIB_UPD:
+        Inst.setOpcode(ARM::RFEIB_UPD);
+        break;
+      case ARM::STMDA:
+        Inst.setOpcode(ARM::SRSDA);
+        break;
+      case ARM::STMDA_UPD:
+        Inst.setOpcode(ARM::SRSDA_UPD);
+        break;
+      case ARM::STMDB:
+        Inst.setOpcode(ARM::SRSDB);
+        break;
+      case ARM::STMDB_UPD:
+        Inst.setOpcode(ARM::SRSDB_UPD);
+        break;
+      case ARM::STMIA:
+        Inst.setOpcode(ARM::SRSIA);
+        break;
+      case ARM::STMIA_UPD:
+        Inst.setOpcode(ARM::SRSIA_UPD);
+        break;
+      case ARM::STMIB:
+        Inst.setOpcode(ARM::SRSIB);
+        break;
+      case ARM::STMIB_UPD:
+        Inst.setOpcode(ARM::SRSIB_UPD);
+        break;
+      default:
+        if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail;
+    }
+
+    // For stores (which become SRS's, the only operand is the mode.
+    if (fieldFromInstruction32(Insn, 20, 1) == 0) {
+      Inst.addOperand(
+          MCOperand::CreateImm(fieldFromInstruction32(Insn, 0, 4)));
+      return S;
+    }
+
+    return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail; // Tied
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction32(Insn, 18, 2);
+  unsigned M = fieldFromInstruction32(Insn, 17, 1);
+  unsigned iflags = fieldFromInstruction32(Insn, 6, 3);
+  unsigned mode = fieldFromInstruction32(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction32(Insn, 9, 2);
+  unsigned M = fieldFromInstruction32(Insn, 8, 1);
+  unsigned iflags = fieldFromInstruction32(Insn, 5, 3);
+  unsigned mode = fieldFromInstruction32(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::t2CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::t2CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 8, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction32(Insn, 0, 8) << 0);
+  imm |= (fieldFromInstruction32(Insn, 12, 3) << 8);
+  imm |= (fieldFromInstruction32(Insn, 16, 4) << 12);
+  imm |= (fieldFromInstruction32(Insn, 26, 1) << 11);
+
+  if (Inst.getOpcode() == ARM::t2MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction32(Insn, 0, 12) << 0);
+  imm |= (fieldFromInstruction32(Insn, 16, 4) << 12);
+
+  if (Inst.getOpcode() == ARM::MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 8, 4);
+  unsigned Ra = fieldFromInstruction32(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned add = fieldFromInstruction32(Val, 12, 1);
+  unsigned imm = fieldFromInstruction32(Val, 0, 12);
+  unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!add) imm *= -1;
+  if (imm == 0 && !add) imm = INT32_MIN;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  if (Rn == 15)
+    tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder);
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+  unsigned U = fieldFromInstruction32(Val, 8, 1);
+  unsigned imm = fieldFromInstruction32(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (U)
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, imm)));
+  else
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm)));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 24) << 2;
+
+  if (pred == 0xF) {
+    Inst.setOpcode(ARM::BLXi);
+    imm |= fieldFromInstruction32(Insn, 24, 1) << 1;
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+    return S;
+  }
+
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, true,
+                                4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+  unsigned align = fieldFromInstruction32(Val, 4, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!align)
+    Inst.addOperand(MCOperand::CreateImm(0));
+  else
+    Inst.addOperand(MCOperand::CreateImm(4 << align));
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  Rn |= fieldFromInstruction32(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+  // First output register
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // Second output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8_UPD:
+    case ARM::VLD1q16_UPD:
+    case ARM::VLD1q32_UPD:
+    case ARM::VLD1q64_UPD:
+    case ARM::VLD1d8T:
+    case ARM::VLD1d16T:
+    case ARM::VLD1d32T:
+    case ARM::VLD1d64T:
+    case ARM::VLD1d8T_UPD:
+    case ARM::VLD1d16T_UPD:
+    case ARM::VLD1d32T_UPD:
+    case ARM::VLD1d64T_UPD:
+    case ARM::VLD1d8Q:
+    case ARM::VLD1d16Q:
+    case ARM::VLD1d32Q:
+    case ARM::VLD1d64Q:
+    case ARM::VLD1d8Q_UPD:
+    case ARM::VLD1d16Q_UPD:
+    case ARM::VLD1d32Q_UPD:
+    case ARM::VLD1d64Q_UPD:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2d8_UPD:
+    case ARM::VLD2d16_UPD:
+    case ARM::VLD2d32_UPD:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2q8_UPD:
+    case ARM::VLD2q16_UPD:
+    case ARM::VLD2q32_UPD:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD2b8:
+    case ARM::VLD2b16:
+    case ARM::VLD2b32:
+    case ARM::VLD2b8_UPD:
+    case ARM::VLD2b16_UPD:
+    case ARM::VLD2b32_UPD:
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+    default:
+      break;
+  }
+
+  // Third output register
+  switch(Inst.getOpcode()) {
+    case ARM::VLD1d8T:
+    case ARM::VLD1d16T:
+    case ARM::VLD1d32T:
+    case ARM::VLD1d64T:
+    case ARM::VLD1d8T_UPD:
+    case ARM::VLD1d16T_UPD:
+    case ARM::VLD1d32T_UPD:
+    case ARM::VLD1d64T_UPD:
+    case ARM::VLD1d8Q:
+    case ARM::VLD1d16Q:
+    case ARM::VLD1d32Q:
+    case ARM::VLD1d64Q:
+    case ARM::VLD1d8Q_UPD:
+    case ARM::VLD1d16Q_UPD:
+    case ARM::VLD1d32Q_UPD:
+    case ARM::VLD1d64Q_UPD:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2q8_UPD:
+    case ARM::VLD2q16_UPD:
+    case ARM::VLD2q32_UPD:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1d8Q:
+    case ARM::VLD1d16Q:
+    case ARM::VLD1d32Q:
+    case ARM::VLD1d64Q:
+    case ARM::VLD1d8Q_UPD:
+    case ARM::VLD1d16Q_UPD:
+    case ARM::VLD1d32Q_UPD:
+    case ARM::VLD1d64Q_UPD:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2q8_UPD:
+    case ARM::VLD2q16_UPD:
+    case ARM::VLD2q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Writeback operand
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1d8_UPD:
+    case ARM::VLD1d16_UPD:
+    case ARM::VLD1d32_UPD:
+    case ARM::VLD1d64_UPD:
+    case ARM::VLD1q8_UPD:
+    case ARM::VLD1q16_UPD:
+    case ARM::VLD1q32_UPD:
+    case ARM::VLD1q64_UPD:
+    case ARM::VLD1d8T_UPD:
+    case ARM::VLD1d16T_UPD:
+    case ARM::VLD1d32T_UPD:
+    case ARM::VLD1d64T_UPD:
+    case ARM::VLD1d8Q_UPD:
+    case ARM::VLD1d16Q_UPD:
+    case ARM::VLD1d32Q_UPD:
+    case ARM::VLD1d64Q_UPD:
+    case ARM::VLD2d8_UPD:
+    case ARM::VLD2d16_UPD:
+    case ARM::VLD2d32_UPD:
+    case ARM::VLD2q8_UPD:
+    case ARM::VLD2q16_UPD:
+    case ARM::VLD2q32_UPD:
+    case ARM::VLD2b8_UPD:
+    case ARM::VLD2b16_UPD:
+    case ARM::VLD2b32_UPD:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  Rn |= fieldFromInstruction32(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+  // Writeback Operand
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8_UPD:
+    case ARM::VST1d16_UPD:
+    case ARM::VST1d32_UPD:
+    case ARM::VST1d64_UPD:
+    case ARM::VST1q8_UPD:
+    case ARM::VST1q16_UPD:
+    case ARM::VST1q32_UPD:
+    case ARM::VST1q64_UPD:
+    case ARM::VST1d8T_UPD:
+    case ARM::VST1d16T_UPD:
+    case ARM::VST1d32T_UPD:
+    case ARM::VST1d64T_UPD:
+    case ARM::VST1d8Q_UPD:
+    case ARM::VST1d16Q_UPD:
+    case ARM::VST1d32Q_UPD:
+    case ARM::VST1d64Q_UPD:
+    case ARM::VST2d8_UPD:
+    case ARM::VST2d16_UPD:
+    case ARM::VST2d32_UPD:
+    case ARM::VST2q8_UPD:
+    case ARM::VST2q16_UPD:
+    case ARM::VST2q32_UPD:
+    case ARM::VST2b8_UPD:
+    case ARM::VST2b16_UPD:
+    case ARM::VST2b32_UPD:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  // First input register
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // Second input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST1q8:
+    case ARM::VST1q16:
+    case ARM::VST1q32:
+    case ARM::VST1q64:
+    case ARM::VST1q8_UPD:
+    case ARM::VST1q16_UPD:
+    case ARM::VST1q32_UPD:
+    case ARM::VST1q64_UPD:
+    case ARM::VST1d8T:
+    case ARM::VST1d16T:
+    case ARM::VST1d32T:
+    case ARM::VST1d64T:
+    case ARM::VST1d8T_UPD:
+    case ARM::VST1d16T_UPD:
+    case ARM::VST1d32T_UPD:
+    case ARM::VST1d64T_UPD:
+    case ARM::VST1d8Q:
+    case ARM::VST1d16Q:
+    case ARM::VST1d32Q:
+    case ARM::VST1d64Q:
+    case ARM::VST1d8Q_UPD:
+    case ARM::VST1d16Q_UPD:
+    case ARM::VST1d32Q_UPD:
+    case ARM::VST1d64Q_UPD:
+    case ARM::VST2d8:
+    case ARM::VST2d16:
+    case ARM::VST2d32:
+    case ARM::VST2d8_UPD:
+    case ARM::VST2d16_UPD:
+    case ARM::VST2d32_UPD:
+    case ARM::VST2q8:
+    case ARM::VST2q16:
+    case ARM::VST2q32:
+    case ARM::VST2q8_UPD:
+    case ARM::VST2q16_UPD:
+    case ARM::VST2q32_UPD:
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST2b8:
+    case ARM::VST2b16:
+    case ARM::VST2b32:
+    case ARM::VST2b8_UPD:
+    case ARM::VST2b16_UPD:
+    case ARM::VST2b32_UPD:
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Third input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8T:
+    case ARM::VST1d16T:
+    case ARM::VST1d32T:
+    case ARM::VST1d64T:
+    case ARM::VST1d8T_UPD:
+    case ARM::VST1d16T_UPD:
+    case ARM::VST1d32T_UPD:
+    case ARM::VST1d64T_UPD:
+    case ARM::VST1d8Q:
+    case ARM::VST1d16Q:
+    case ARM::VST1d32Q:
+    case ARM::VST1d64Q:
+    case ARM::VST1d8Q_UPD:
+    case ARM::VST1d16Q_UPD:
+    case ARM::VST1d32Q_UPD:
+    case ARM::VST1d64Q_UPD:
+    case ARM::VST2q8:
+    case ARM::VST2q16:
+    case ARM::VST2q32:
+    case ARM::VST2q8_UPD:
+    case ARM::VST2q16_UPD:
+    case ARM::VST2q32_UPD:
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8Q:
+    case ARM::VST1d16Q:
+    case ARM::VST1d32Q:
+    case ARM::VST1d64Q:
+    case ARM::VST1d8Q_UPD:
+    case ARM::VST1d16Q_UPD:
+    case ARM::VST1d32Q_UPD:
+    case ARM::VST1d64Q_UPD:
+    case ARM::VST2q8:
+    case ARM::VST2q16:
+    case ARM::VST2q32:
+    case ARM::VST2q8_UPD:
+    case ARM::VST2q16_UPD:
+    case ARM::VST2q32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned align = fieldFromInstruction32(Insn, 4, 1);
+  unsigned size = fieldFromInstruction32(Insn, 6, 2);
+  unsigned regs = fieldFromInstruction32(Insn, 5, 1) + 1;
+
+  align *= (1 << size);
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (regs == 2) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned align = fieldFromInstruction32(Insn, 4, 1);
+  unsigned size = 1 << fieldFromInstruction32(Insn, 6, 2);
+  unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+  align *= 2*size;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned size = fieldFromInstruction32(Insn, 6, 2);
+  unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+  unsigned align = fieldFromInstruction32(Insn, 4, 1);
+
+  if (size == 0x3) {
+    size = 4;
+    align = 16;
+  } else {
+    if (size == 2) {
+      size = 1 << size;
+      align *= 8;
+    } else {
+      size = 1 << size;
+      align *= 4*size;
+    }
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeNEONModImmInstruction(llvm::MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned imm = fieldFromInstruction32(Insn, 0, 4);
+  imm |= fieldFromInstruction32(Insn, 16, 3) << 4;
+  imm |= fieldFromInstruction32(Insn, 24, 1) << 7;
+  imm |= fieldFromInstruction32(Insn, 8, 4) << 8;
+  imm |= fieldFromInstruction32(Insn, 5, 1) << 12;
+  unsigned Q = fieldFromInstruction32(Insn, 6, 1);
+
+  if (Q) {
+    if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  switch (Inst.getOpcode()) {
+    case ARM::VORRiv4i16:
+    case ARM::VORRiv2i32:
+    case ARM::VBICiv4i16:
+    case ARM::VBICiv2i32:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VORRiv8i16:
+    case ARM::VORRiv4i32:
+    case ARM::VBICiv8i16:
+    case ARM::VBICiv4i32:
+      if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 18, 2);
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(8 << size));
+
+  return S;
+}
+
+static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(8 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(32 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  Rn |= fieldFromInstruction32(Insn, 7, 1) << 4;
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+  unsigned op = fieldFromInstruction32(Insn, 6, 1);
+  unsigned length = fieldFromInstruction32(Insn, 8, 2) + 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (op) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail; // Writeback
+  }
+
+  for (unsigned i = 0; i < length; ++i) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, (Rn+i)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn,
+                                     uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned dst = fieldFromInstruction16(Insn, 8, 3);
+  unsigned imm = fieldFromInstruction16(Insn, 0, 8);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch(Inst.getOpcode()) {
+    default:
+      return MCDisassembler::Fail;
+    case ARM::tADR:
+      break; // tADR does not explicitly represent the PC as an operand.
+    case ARM::tADDrSPi:
+      Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  return S;
+}
+
+static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 0, 3);
+  unsigned Rm = fieldFromInstruction32(Val, 3, 3);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 0, 3);
+  unsigned imm = fieldFromInstruction32(Val, 3, 5);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  unsigned imm = Val << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(Val));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 6, 4);
+  unsigned Rm = fieldFromInstruction32(Val, 2, 4);
+  unsigned imm = fieldFromInstruction32(Val, 0, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2PLDs:
+    case ARM::t2PLDWs:
+    case ARM::t2PLIs:
+      break;
+    default: {
+      unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+      if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+    }
+  }
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  if (Rn == 0xF) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBs:
+        Inst.setOpcode(ARM::t2LDRBpci);
+        break;
+      case ARM::t2LDRHs:
+        Inst.setOpcode(ARM::t2LDRHpci);
+        break;
+      case ARM::t2LDRSHs:
+        Inst.setOpcode(ARM::t2LDRSHpci);
+        break;
+      case ARM::t2LDRSBs:
+        Inst.setOpcode(ARM::t2LDRSBpci);
+        break;
+      case ARM::t2PLDs:
+        Inst.setOpcode(ARM::t2PLDi12);
+        Inst.addOperand(MCOperand::CreateReg(ARM::PC));
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+
+    int imm = fieldFromInstruction32(Insn, 0, 12);
+    if (!fieldFromInstruction32(Insn, 23, 1)) imm *= -1;
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+    return S;
+  }
+
+  unsigned addrmode = fieldFromInstruction32(Insn, 4, 2);
+  addrmode |= fieldFromInstruction32(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction32(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  int imm = Val & 0xFF;
+  if (!(Val & 0x100)) imm *= -1;
+  Inst.addOperand(MCOperand::CreateImm(imm << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+  unsigned imm = fieldFromInstruction32(Val, 0, 9);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 8, 4);
+  unsigned imm = fieldFromInstruction32(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val,
+                         uint64_t Address, const void *Decoder) {
+  int imm = Val & 0xFF;
+  if (Val == 0)
+    imm = INT32_MIN;
+  else if (!(Val & 0x100))
+    imm *= -1;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+  unsigned imm = fieldFromInstruction32(Val, 0, 9);
+
+  // Some instructions always use an additive offset.
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+    case ARM::t2LDRBT:
+    case ARM::t2LDRHT:
+    case ARM::t2LDRSBT:
+    case ARM::t2LDRSHT:
+    case ARM::t2STRT:
+    case ARM::t2STRBT:
+    case ARM::t2STRHT:
+      imm |= 0x100;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+  addr |= fieldFromInstruction32(Insn, 9, 1) << 8;
+  addr |= Rn << 9;
+  unsigned load = fieldFromInstruction32(Insn, 20, 1);
+
+  if (!load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+  unsigned imm = fieldFromInstruction32(Val, 0, 12);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned imm = fieldFromInstruction16(Insn, 0, 7);
+
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Inst.getOpcode() == ARM::tADDrSP) {
+    unsigned Rdm = fieldFromInstruction16(Insn, 0, 3);
+    Rdm |= fieldFromInstruction16(Insn, 7, 1) << 3;
+
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  } else if (Inst.getOpcode() == ARM::tADDspr) {
+    unsigned Rm = fieldFromInstruction16(Insn, 3, 4);
+
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn,
+                           uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction16(Insn, 4, 1) | 0x2;
+  unsigned flags = fieldFromInstruction16(Insn, 0, 3);
+
+  Inst.addOperand(MCOperand::CreateImm(imod));
+  Inst.addOperand(MCOperand::CreateImm(flags));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned add = fieldFromInstruction32(Insn, 4, 1);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(add));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, 
+                                (Address & ~2u) + SignExtend32<22>(Val << 1) + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  if (Val == 0xA || Val == 0xB)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Insn,
+                       uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+  if (Rn == ARM::SP) S = MCDisassembler::SoftFail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus
+DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction32(Insn, 22, 4);
+  if (pred == 0xE || pred == 0xF) {
+    unsigned opc = fieldFromInstruction32(Insn, 4, 28);
+    switch (opc) {
+      default:
+        return MCDisassembler::Fail;
+      case 0xf3bf8f4:
+        Inst.setOpcode(ARM::t2DSB);
+        break;
+      case 0xf3bf8f5:
+        Inst.setOpcode(ARM::t2DMB);
+        break;
+      case 0xf3bf8f6:
+        Inst.setOpcode(ARM::t2ISB);
+        break;
+    }
+
+    unsigned imm = fieldFromInstruction32(Insn, 0, 4);
+    return DecodeMemBarrierOption(Inst, imm, Address, Decoder);
+  }
+
+  unsigned brtarget = fieldFromInstruction32(Insn, 0, 11) << 1;
+  brtarget |= fieldFromInstruction32(Insn, 11, 1) << 19;
+  brtarget |= fieldFromInstruction32(Insn, 13, 1) << 18;
+  brtarget |= fieldFromInstruction32(Insn, 16, 6) << 12;
+  brtarget |= fieldFromInstruction32(Insn, 26, 1) << 20;
+
+  if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+// Decode a shifted immediate operand.  These basically consist
+// of an 8-bit value, and a 4-bit directive that specifies either
+// a splat operation or a rotation.
+static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  unsigned ctrl = fieldFromInstruction32(Val, 10, 2);
+  if (ctrl == 0) {
+    unsigned byte = fieldFromInstruction32(Val, 8, 2);
+    unsigned imm = fieldFromInstruction32(Val, 0, 8);
+    switch (byte) {
+      case 0:
+        Inst.addOperand(MCOperand::CreateImm(imm));
+        break;
+      case 1:
+        Inst.addOperand(MCOperand::CreateImm((imm << 16) | imm));
+        break;
+      case 2:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 8)));
+        break;
+      case 3:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 16) |
+                                             (imm << 8)  |  imm));
+        break;
+    }
+  } else {
+    unsigned unrot = fieldFromInstruction32(Val, 0, 7) | 0x80;
+    unsigned rot = fieldFromInstruction32(Val, 7, 5);
+    unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31));
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbBCCTargetOperand(llvm::MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder){
+  Inst.addOperand(MCOperand::CreateImm(Val << 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder){
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  switch (Val) {
+  default:
+    return MCDisassembler::Fail;
+  case 0xF: // SY
+  case 0xE: // ST
+  case 0xB: // ISH
+  case 0xA: // ISHST
+  case 0x7: // NSH
+  case 0x6: // NSHST
+  case 0x3: // OSH
+  case 0x2: // OSHST
+    break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  if (!Val) return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder){
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+  if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+  imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+  imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+  if (Rm == 0xF) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+  imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+  imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 4, 2) != 0)
+        align = 4;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 4, 2) != 0)
+        align = 4;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction32(Insn, 5, 3);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction32(Insn, 5, 3);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 4, 2))
+        align = 4 << fieldFromInstruction32(Insn, 4, 2);
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+  Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction32(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction32(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction32(Insn, 6, 2);
+      if (fieldFromInstruction32(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction32(Insn, 4, 2))
+        align = 4 << fieldFromInstruction32(Insn, 4, 2);
+      index = fieldFromInstruction32(Insn, 7, 1);
+      if (fieldFromInstruction32(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction32(Insn,  0, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction32(Insn,  0, 4);
+  unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned pred = fieldFromInstruction16(Insn, 4, 4);
+  // The InstPrinter needs to have the low bit of the predicate in
+  // the mask operand to be able to print it properly.
+  unsigned mask = fieldFromInstruction16(Insn, 0, 5);
+
+  if (pred == 0xF) {
+    pred = 0xE;
+    S = MCDisassembler::SoftFail;
+  }
+
+  if ((mask & 0xF) == 0) {
+    // Preserve the high bit of the mask, which is the low bit of
+    // the predicate.
+    mask &= 0x10;
+    mask |= 0x8;
+    S = MCDisassembler::SoftFail;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(pred));
+  Inst.addOperand(MCOperand::CreateImm(mask));
+  return S;
+}
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(llvm::MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+  unsigned W = fieldFromInstruction32(Insn, 21, 1);
+  unsigned U = fieldFromInstruction32(Insn, 23, 1);
+  unsigned P = fieldFromInstruction32(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(S, MCDisassembler::SoftFail);
+
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(llvm::MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+  unsigned W = fieldFromInstruction32(Insn, 21, 1);
+  unsigned U = fieldFromInstruction32(Insn, 23, 1);
+  unsigned P = fieldFromInstruction32(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned sign1 = fieldFromInstruction32(Insn, 21, 1);
+  unsigned sign2 = fieldFromInstruction32(Insn, 23, 1);
+  if (sign1 != sign2) return MCDisassembler::Fail;
+
+  unsigned Val = fieldFromInstruction32(Insn, 0, 8);
+  Val |= fieldFromInstruction32(Insn, 12, 3) << 8;
+  Val |= fieldFromInstruction32(Insn, 26, 1) << 11;
+  Val |= sign1 << 12;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<13>(Val)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  if (Val == 0x20) S = MCDisassembler::SoftFail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return S;
+}
+
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.h b/lib/Target/ARM/Disassembler/ARMDisassembler.h
deleted file mode 100644
index 0a74a3866eedc..0000000000000
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.h
+++ /dev/null
@@ -1,99 +0,0 @@
-//===- ARMDisassembler.h - Disassembler for ARM/Thumb ISA -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains the header for ARMDisassembler and ThumbDisassembler, both are
-// subclasses of MCDisassembler.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMDISASSEMBLER_H
-#define ARMDISASSEMBLER_H
-
-#include "llvm/MC/MCDisassembler.h"
-
-namespace llvm {
-  
-class MCInst;
-class MemoryObject;
-class raw_ostream;
-  
-struct EDInstInfo;
-  
-/// ARMDisassembler - ARM disassembler for all ARM platforms.
-class ARMDisassembler : public MCDisassembler {
-public:
-  /// Constructor     - Initializes the disassembler.
-  ///
-  ARMDisassembler() :
-    MCDisassembler() {
-  }
-
-  ~ARMDisassembler() {
-  }
-
-  /// getInstruction - See MCDisassembler.
-  bool getInstruction(MCInst &instr,
-                      uint64_t &size,
-                      const MemoryObject &region,
-                      uint64_t address,
-                      raw_ostream &vStream) const;
-  
-  /// getEDInfo - See MCDisassembler.
-  EDInstInfo *getEDInfo() const;
-private:
-};
-
-// Forward declaration.
-class ARMBasicMCBuilder;
-
-/// Session - Keep track of the IT Block progression.
-class Session {
-  friend class ARMBasicMCBuilder;
-public:
-  Session() : ITCounter(0), ITState(0) {}
-  ~Session() {}
-  /// InitIT - Initializes ITCounter/ITState.
-  bool InitIT(unsigned short bits7_0);
-  /// UpdateIT - Updates ITCounter/ITState as IT Block progresses.
-  void UpdateIT();
-
-private:
-  unsigned ITCounter; // Possible values: 0, 1, 2, 3, 4.
-  unsigned ITState;   // A2.5.2 Consists of IT[7:5] and IT[4:0] initially.
-};
-
-/// ThumbDisassembler - Thumb disassembler for all ARM platforms.
-class ThumbDisassembler : public MCDisassembler {
-public:
-  /// Constructor     - Initializes the disassembler.
-  ///
-  ThumbDisassembler() :
-    MCDisassembler(), SO() {
-  }
-
-  ~ThumbDisassembler() {
-  }
-
-  /// getInstruction - See MCDisassembler.
-  bool getInstruction(MCInst &instr,
-                      uint64_t &size,
-                      const MemoryObject &region,
-                      uint64_t address,
-                      raw_ostream &vStream) const;
-  
-  /// getEDInfo - See MCDisassembler.
-  EDInstInfo *getEDInfo() const;
-private:
-  Session SO;
-};
-
-} // namespace llvm
-  
-#endif
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
deleted file mode 100644
index d89c80a9d4575..0000000000000
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
+++ /dev/null
@@ -1,3818 +0,0 @@
-//===- ARMDisassemblerCore.cpp - ARM disassembler helpers -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code to represent the core concepts of Builder and DisassembleFP
-// to solve the problem of disassembling an ARM instr.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm-disassembler"
-
-#include "ARMDisassemblerCore.h"
-#include "ARMAddressingModes.h"
-#include "ARMMCExpr.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-//#define DEBUG(X) do { X; } while (0)
-
-/// ARMGenInstrInfo.inc - ARMGenInstrInfo.inc contains the static const
-/// MCInstrDesc ARMInsts[] definition and the MCOperandInfo[]'s describing the
-/// operand info for each ARMInsts[i].
-///
-/// Together with an instruction's encoding format, we can take advantage of the
-/// NumOperands and the OpInfo fields of the target instruction description in
-/// the quest to build out the MCOperand list for an MCInst.
-///
-/// The general guideline is that with a known format, the number of dst and src
-/// operands are well-known.  The dst is built first, followed by the src
-/// operand(s).  The operands not yet used at this point are for the Implicit
-/// Uses and Defs by this instr.  For the Uses part, the pred:$p operand is
-/// defined with two components:
-///
-/// def pred { // Operand PredicateOperand
-///   ValueType Type = OtherVT;
-///   string PrintMethod = "printPredicateOperand";
-///   string AsmOperandLowerMethod = ?;
-///   dag MIOperandInfo = (ops i32imm, CCR);
-///   AsmOperandClass ParserMatchClass = ImmAsmOperand;
-///   dag DefaultOps = (ops (i32 14), (i32 zero_reg));
-/// }
-///
-/// which is manifested by the MCOperandInfo[] of:
-///
-/// { 0, 0|(1<<MCOI::Predicate), 0 },
-/// { ARM::CCRRegClassID, 0|(1<<MCOI::Predicate), 0 }
-///
-/// So the first predicate MCOperand corresponds to the immediate part of the
-/// ARM condition field (Inst{31-28}), and the second predicate MCOperand
-/// corresponds to a register kind of ARM::CPSR.
-///
-/// For the Defs part, in the simple case of only cc_out:$s, we have:
-///
-/// def cc_out { // Operand OptionalDefOperand
-///   ValueType Type = OtherVT;
-///   string PrintMethod = "printSBitModifierOperand";
-///   string AsmOperandLowerMethod = ?;
-///   dag MIOperandInfo = (ops CCR);
-///   AsmOperandClass ParserMatchClass = ImmAsmOperand;
-///   dag DefaultOps = (ops (i32 zero_reg));
-/// }
-///
-/// which is manifested by the one MCOperandInfo of:
-///
-/// { ARM::CCRRegClassID, 0|(1<<MCOI::OptionalDef), 0 }
-///
-
-namespace llvm {
-extern MCInstrDesc ARMInsts[];
-}
-
-using namespace llvm;
-
-const char *ARMUtils::OpcodeName(unsigned Opcode) {
-  return ARMInsts[Opcode].Name;
-}
-
-// Return the register enum Based on RegClass and the raw register number.
-// FIXME: Auto-gened?
-static unsigned
-getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) {
-  if (RegClassID == ARM::rGPRRegClassID) {
-    // Check for The register numbers 13 and 15 that are not permitted for many
-    // Thumb register specifiers.
-    if (RawRegister == 13 || RawRegister == 15) {
-      B->SetErr(-1);
-      return 0;
-    }
-    // For this purpose, we can treat rGPR as if it were GPR.
-    RegClassID = ARM::GPRRegClassID;
-  }
-
-  // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm().
-  // A7.3 register encoding
-  //     Qd -> bit[12] == 0
-  //     Qn -> bit[16] == 0
-  //     Qm -> bit[0]  == 0
-  //
-  // If one of these bits is 1, the instruction is UNDEFINED.
-  if (RegClassID == ARM::QPRRegClassID && slice(RawRegister, 0, 0) == 1) {
-    B->SetErr(-1);
-    return 0;
-  }
-  unsigned RegNum =
-    RegClassID == ARM::QPRRegClassID ? RawRegister >> 1 : RawRegister;
-
-  switch (RegNum) {
-  default:
-    break;
-  case 0:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R0;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D0;
-    case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
-    case ARM::QPR_VFP2RegClassID:
-      return ARM::Q0;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S0;
-    }
-    break;
-  case 1:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R1;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D1;
-    case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
-    case ARM::QPR_VFP2RegClassID:
-      return ARM::Q1;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S1;
-    }
-    break;
-  case 2:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R2;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D2;
-    case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
-    case ARM::QPR_VFP2RegClassID:
-      return ARM::Q2;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S2;
-    }
-    break;
-  case 3:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R3;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D3;
-    case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
-    case ARM::QPR_VFP2RegClassID:
-      return ARM::Q3;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S3;
-    }
-    break;
-  case 4:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R4;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D4;
-    case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q4;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S4;
-    }
-    break;
-  case 5:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R5;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D5;
-    case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q5;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S5;
-    }
-    break;
-  case 6:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R6;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D6;
-    case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q6;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S6;
-    }
-    break;
-  case 7:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R7;
-    case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
-    case ARM::DPR_VFP2RegClassID:
-      return ARM::D7;
-    case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q7;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S7;
-    }
-    break;
-  case 8:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::R8;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D8;
-    case ARM::QPRRegClassID: return ARM::Q8;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S8;
-    }
-    break;
-  case 9:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::R9;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D9;
-    case ARM::QPRRegClassID: return ARM::Q9;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S9;
-    }
-    break;
-  case 10:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::R10;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D10;
-    case ARM::QPRRegClassID: return ARM::Q10;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S10;
-    }
-    break;
-  case 11:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::R11;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D11;
-    case ARM::QPRRegClassID: return ARM::Q11;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S11;
-    }
-    break;
-  case 12:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::R12;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D12;
-    case ARM::QPRRegClassID: return ARM::Q12;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S12;
-    }
-    break;
-  case 13:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::SP;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D13;
-    case ARM::QPRRegClassID: return ARM::Q13;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S13;
-    }
-    break;
-  case 14:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::LR;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D14;
-    case ARM::QPRRegClassID: return ARM::Q14;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S14;
-    }
-    break;
-  case 15:
-    switch (RegClassID) {
-    case ARM::GPRRegClassID: return ARM::PC;
-    case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D15;
-    case ARM::QPRRegClassID: return ARM::Q15;
-    case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S15;
-    }
-    break;
-  case 16:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D16;
-    case ARM::SPRRegClassID: return ARM::S16;
-    }
-    break;
-  case 17:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D17;
-    case ARM::SPRRegClassID: return ARM::S17;
-    }
-    break;
-  case 18:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D18;
-    case ARM::SPRRegClassID: return ARM::S18;
-    }
-    break;
-  case 19:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D19;
-    case ARM::SPRRegClassID: return ARM::S19;
-    }
-    break;
-  case 20:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D20;
-    case ARM::SPRRegClassID: return ARM::S20;
-    }
-    break;
-  case 21:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D21;
-    case ARM::SPRRegClassID: return ARM::S21;
-    }
-    break;
-  case 22:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D22;
-    case ARM::SPRRegClassID: return ARM::S22;
-    }
-    break;
-  case 23:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D23;
-    case ARM::SPRRegClassID: return ARM::S23;
-    }
-    break;
-  case 24:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D24;
-    case ARM::SPRRegClassID: return ARM::S24;
-    }
-    break;
-  case 25:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D25;
-    case ARM::SPRRegClassID: return ARM::S25;
-    }
-    break;
-  case 26:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D26;
-    case ARM::SPRRegClassID: return ARM::S26;
-    }
-    break;
-  case 27:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D27;
-    case ARM::SPRRegClassID: return ARM::S27;
-    }
-    break;
-  case 28:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D28;
-    case ARM::SPRRegClassID: return ARM::S28;
-    }
-    break;
-  case 29:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D29;
-    case ARM::SPRRegClassID: return ARM::S29;
-    }
-    break;
-  case 30:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D30;
-    case ARM::SPRRegClassID: return ARM::S30;
-    }
-    break;
-  case 31:
-    switch (RegClassID) {
-    case ARM::DPRRegClassID: return ARM::D31;
-    case ARM::SPRRegClassID: return ARM::S31;
-    }
-    break;
-  }
-  DEBUG(errs() << "Invalid (RegClassID, RawRegister) combination\n");
-  // Encoding error.  Mark the builder with error code != 0.
-  B->SetErr(-1);
-  return 0;
-}
-
-///////////////////////////////
-//                           //
-//     Utility Functions     //
-//                           //
-///////////////////////////////
-
-// Extract/Decode Rd: Inst{15-12}.
-static inline unsigned decodeRd(uint32_t insn) {
-  return (insn >> ARMII::RegRdShift) & ARMII::GPRRegMask;
-}
-
-// Extract/Decode Rn: Inst{19-16}.
-static inline unsigned decodeRn(uint32_t insn) {
-  return (insn >> ARMII::RegRnShift) & ARMII::GPRRegMask;
-}
-
-// Extract/Decode Rm: Inst{3-0}.
-static inline unsigned decodeRm(uint32_t insn) {
-  return (insn & ARMII::GPRRegMask);
-}
-
-// Extract/Decode Rs: Inst{11-8}.
-static inline unsigned decodeRs(uint32_t insn) {
-  return (insn >> ARMII::RegRsShift) & ARMII::GPRRegMask;
-}
-
-static inline unsigned getCondField(uint32_t insn) {
-  return (insn >> ARMII::CondShift);
-}
-
-static inline unsigned getIBit(uint32_t insn) {
-  return (insn >> ARMII::I_BitShift) & 1;
-}
-
-static inline unsigned getAM3IBit(uint32_t insn) {
-  return (insn >> ARMII::AM3_I_BitShift) & 1;
-}
-
-static inline unsigned getPBit(uint32_t insn) {
-  return (insn >> ARMII::P_BitShift) & 1;
-}
-
-static inline unsigned getUBit(uint32_t insn) {
-  return (insn >> ARMII::U_BitShift) & 1;
-}
-
-static inline unsigned getPUBits(uint32_t insn) {
-  return (insn >> ARMII::U_BitShift) & 3;
-}
-
-static inline unsigned getSBit(uint32_t insn) {
-  return (insn >> ARMII::S_BitShift) & 1;
-}
-
-static inline unsigned getWBit(uint32_t insn) {
-  return (insn >> ARMII::W_BitShift) & 1;
-}
-
-static inline unsigned getDBit(uint32_t insn) {
-  return (insn >> ARMII::D_BitShift) & 1;
-}
-
-static inline unsigned getNBit(uint32_t insn) {
-  return (insn >> ARMII::N_BitShift) & 1;
-}
-
-static inline unsigned getMBit(uint32_t insn) {
-  return (insn >> ARMII::M_BitShift) & 1;
-}
-
-// See A8.4 Shifts applied to a register.
-//     A8.4.2 Register controlled shifts.
-//
-// getShiftOpcForBits - getShiftOpcForBits translates from the ARM encoding bits
-// into llvm enums for shift opcode.  The API clients should pass in the value
-// encoded with two bits, so the assert stays to signal a wrong API usage.
-//
-// A8-12: DecodeRegShift()
-static inline ARM_AM::ShiftOpc getShiftOpcForBits(unsigned bits) {
-  switch (bits) {
-  default: assert(0 && "No such value"); return ARM_AM::no_shift;
-  case 0:  return ARM_AM::lsl;
-  case 1:  return ARM_AM::lsr;
-  case 2:  return ARM_AM::asr;
-  case 3:  return ARM_AM::ror;
-  }
-}
-
-// See A8.4 Shifts applied to a register.
-//     A8.4.1 Constant shifts.
-//
-// getImmShiftSE - getImmShiftSE translates from the raw ShiftOpc and raw Imm5
-// encodings into the intended ShiftOpc and shift amount.
-//
-// A8-11: DecodeImmShift()
-static inline void getImmShiftSE(ARM_AM::ShiftOpc &ShOp, unsigned &ShImm) {
-  if (ShImm != 0)
-    return;
-  switch (ShOp) {
-  case ARM_AM::no_shift:
-  case ARM_AM::rrx:
-    break;
-  case ARM_AM::lsl:
-    ShOp = ARM_AM::no_shift;
-    break;
-  case ARM_AM::lsr:
-  case ARM_AM::asr:
-    ShImm = 32;
-    break;
-  case ARM_AM::ror:
-    ShOp = ARM_AM::rrx;
-    break;
-  }
-}
-
-// getAMSubModeForBits - getAMSubModeForBits translates from the ARM encoding
-// bits Inst{24-23} (P(24) and U(23)) into llvm enums for AMSubMode.  The API
-// clients should pass in the value encoded with two bits, so the assert stays
-// to signal a wrong API usage.
-static inline ARM_AM::AMSubMode getAMSubModeForBits(unsigned bits) {
-  switch (bits) {
-  default: assert(0 && "No such value"); return ARM_AM::bad_am_submode;
-  case 1:  return ARM_AM::ia;   // P=0 U=1
-  case 3:  return ARM_AM::ib;   // P=1 U=1
-  case 0:  return ARM_AM::da;   // P=0 U=0
-  case 2:  return ARM_AM::db;   // P=1 U=0
-  }
-}
-
-////////////////////////////////////////////
-//                                        //
-//    Disassemble function definitions    //
-//                                        //
-////////////////////////////////////////////
-
-/// There is a separate Disassemble*Frm function entry for disassembly of an ARM
-/// instr into a list of MCOperands in the appropriate order, with possible dst,
-/// followed by possible src(s).
-///
-/// The processing of the predicate, and the 'S' modifier bit, if MI modifies
-/// the CPSR, is factored into ARMBasicMCBuilder's method named
-/// TryPredicateAndSBitModifier.
-
-static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
-  assert(0 && "Unexpected pseudo instruction!");
-  return false;
-}
-
-// A8.6.94 MLA
-// if d == 15 || n == 15 || m == 15 || a == 15 then UNPREDICTABLE;
-//
-// A8.6.105 MUL
-// if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
-//
-// A8.6.246 UMULL
-// if dLo == 15 || dHi == 15 || n == 15 || m == 15 then UNPREDICTABLE;
-// if dHi == dLo then UNPREDICTABLE;
-static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
-  unsigned R19_16 = slice(insn, 19, 16);
-  unsigned R15_12 = slice(insn, 15, 12);
-  unsigned R11_8  = slice(insn, 11, 8);
-  unsigned R3_0   = slice(insn, 3, 0);
-  switch (Opcode) {
-  default:
-    // Did we miss an opcode?
-    DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!");
-    return false;
-  case ARM::MLA:     case ARM::MLS:     case ARM::SMLABB:  case ARM::SMLABT:
-  case ARM::SMLATB:  case ARM::SMLATT:  case ARM::SMLAWB:  case ARM::SMLAWT:
-  case ARM::SMMLA:   case ARM::SMMLAR:  case ARM::SMMLS:   case ARM::SMMLSR:
-  case ARM::USADA8:
-    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
-      return true;
-    return false;
-  case ARM::MUL:     case ARM::SMMUL:   case ARM::SMMULR:
-  case ARM::SMULBB:  case ARM::SMULBT:  case ARM::SMULTB:  case ARM::SMULTT:
-  case ARM::SMULWB:  case ARM::SMULWT:  case ARM::SMUAD:   case ARM::SMUADX:
-  // A8.6.167 SMLAD & A8.6.172 SMLSD
-  case ARM::SMLAD:   case ARM::SMLADX:  case ARM::SMLSD:   case ARM::SMLSDX:
-  case ARM::USAD8:
-    if (R19_16 == 15 || R11_8 == 15 || R3_0 == 15)
-      return true;
-    return false;
-  case ARM::SMLAL:   case ARM::SMULL:   case ARM::UMAAL:   case ARM::UMLAL:
-  case ARM::UMULL:
-  case ARM::SMLALBB: case ARM::SMLALBT: case ARM::SMLALTB: case ARM::SMLALTT:
-  case ARM::SMLALD:  case ARM::SMLALDX: case ARM::SMLSLD:  case ARM::SMLSLDX:
-    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
-      return true;
-    if (R19_16 == R15_12)
-      return true;
-    return false;;
-  }
-}
-
-// Multiply Instructions.
-// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR,
-// SMMLS, SMMLAR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience):
-//     Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}
-// But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is
-// only for {d, n, m}.
-//
-// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD,
-// SMUADX, and USAD8 (for convenience):
-//     Rd{19-16} Rn{3-0} Rm{11-8}
-//
-// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT,
-// SMLALD, SMLADLX, SMLSLD, SMLSLDX:
-//     RdLo{15-12} RdHi{19-16} Rn{3-0} Rm{11-8}
-//
-// The mapping of the multiply registers to the "regular" ARM registers, where
-// there are convenience decoder functions, is:
-//
-// Inst{15-12} => Rd
-// Inst{19-16} => Rn
-// Inst{3-0} => Rm
-// Inst{11-8} => Rs
-static bool DisassembleMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  unsigned short NumDefs = MCID.getNumDefs();
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumDefs > 0 && "NumDefs should be greater than 0 for MulFrm");
-  assert(NumOps >= 3
-         && OpInfo[0].RegClass == ARM::GPRRegClassID
-         && OpInfo[1].RegClass == ARM::GPRRegClassID
-         && OpInfo[2].RegClass == ARM::GPRRegClassID
-         && "Expect three register operands");
-
-  // Sanity check for the register encodings.
-  if (BadRegsMulFrm(Opcode, insn))
-    return false;
-
-  // Instructions with two destination registers have RdLo{15-12} first.
-  if (NumDefs == 2) {
-    assert(NumOps >= 4 && OpInfo[3].RegClass == ARM::GPRRegClassID &&
-           "Expect 4th register operand");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    ++OpIdx;
-  }
-
-  // The destination register: RdHi{19-16} or Rd{19-16}.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-
-  // The two src regsiters: Rn{3-0}, then Rm{11-8}.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRs(insn))));
-  OpIdx += 3;
-
-  // Many multiply instructions (e.g., MLA) have three src registers.
-  // The third register operand is Ra{15-12}.
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// Helper routines for disassembly of coprocessor instructions.
-
-static bool LdStCopOpcode(unsigned Opcode) {
-  if ((Opcode >= ARM::LDC2L_OFFSET && Opcode <= ARM::LDC_PRE) ||
-      (Opcode >= ARM::STC2L_OFFSET && Opcode <= ARM::STC_PRE))
-    return true;
-  return false;
-}
-static bool CoprocessorOpcode(unsigned Opcode) {
-  if (LdStCopOpcode(Opcode))
-    return true;
-
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::CDP:  case ARM::CDP2:
-  case ARM::MCR:  case ARM::MCR2:  case ARM::MRC:  case ARM::MRC2:
-  case ARM::MCRR: case ARM::MCRR2: case ARM::MRRC: case ARM::MRRC2:
-    return true;
-  }
-}
-static inline unsigned GetCoprocessor(uint32_t insn) {
-  return slice(insn, 11, 8);
-}
-static inline unsigned GetCopOpc1(uint32_t insn, bool CDP) {
-  return CDP ? slice(insn, 23, 20) : slice(insn, 23, 21);
-}
-static inline unsigned GetCopOpc2(uint32_t insn) {
-  return slice(insn, 7, 5);
-}
-static inline unsigned GetCopOpc(uint32_t insn) {
-  return slice(insn, 7, 4);
-}
-// Most of the operands are in immediate forms, except Rd and Rn, which are ARM
-// core registers.
-//
-// CDP, CDP2:                cop opc1 CRd CRn CRm opc2
-//
-// MCR, MCR2, MRC, MRC2:     cop opc1 Rd CRn CRm opc2
-//
-// MCRR, MCRR2, MRRC, MRRc2: cop opc Rd Rn CRm
-//
-// LDC_OFFSET, LDC_PRE, LDC_POST: cop CRd Rn R0 [+/-]imm8:00
-// and friends
-// STC_OFFSET, STC_PRE, STC_POST: cop CRd Rn R0 [+/-]imm8:00
-// and friends
-//                                        <-- addrmode2 -->
-//
-// LDC_OPTION:                    cop CRd Rn imm8
-// and friends
-// STC_OPTION:                    cop CRd Rn imm8
-// and friends
-//
-static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 4 && "Num of operands >= 4 for coprocessor instr");
-
-  unsigned &OpIdx = NumOpsAdded;
-  // A8.6.92
-  // if coproc == '101x' then SEE "Advanced SIMD and VFP"
-  // But since the special instructions have more explicit encoding bits
-  // specified, if coproc == 10 or 11, we should reject it as invalid.
-  unsigned coproc = GetCoprocessor(insn);
-  if ((Opcode == ARM::MCR || Opcode == ARM::MCRR ||
-       Opcode == ARM::MRC || Opcode == ARM::MRRC) &&
-      (coproc == 10 || coproc == 11)) {
-    DEBUG(errs() << "Encoding error: coproc == 10 or 11 for MCR[R]/MR[R]C\n");
-    return false;
-  }
-
-  bool OneCopOpc = (Opcode == ARM::MCRR || Opcode == ARM::MCRR2 ||
-                    Opcode == ARM::MRRC || Opcode == ARM::MRRC2);
-
-  // CDP/CDP2 has no GPR operand; the opc1 operand is also wider (Inst{23-20}).
-  bool NoGPR = (Opcode == ARM::CDP || Opcode == ARM::CDP2);
-  bool LdStCop = LdStCopOpcode(Opcode);
-  bool RtOut = (Opcode == ARM::MRC || Opcode == ARM::MRC2);
-
-  OpIdx = 0;
-
-  if (RtOut) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    ++OpIdx;
-  }
-  MI.addOperand(MCOperand::CreateImm(coproc));
-  ++OpIdx;
-
-  if (LdStCop) {
-    // Unindex if P:W = 0b00 --> _OPTION variant
-    unsigned PW = getPBit(insn) << 1 | getWBit(insn);
-
-    MI.addOperand(MCOperand::CreateImm(decodeRd(insn)));
-
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    OpIdx += 2;
-
-    if (PW) {
-      MI.addOperand(MCOperand::CreateReg(0));
-      ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-      const MCInstrDesc &MCID = ARMInsts[Opcode];
-      unsigned IndexMode =
-                 (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
-      unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, slice(insn, 7, 0) << 2,
-                                          ARM_AM::no_shift, IndexMode);
-      MI.addOperand(MCOperand::CreateImm(Offset));
-      OpIdx += 2;
-    } else {
-      MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 0)));
-      ++OpIdx;
-    }
-  } else {
-    MI.addOperand(MCOperand::CreateImm(OneCopOpc ? GetCopOpc(insn)
-                                                 : GetCopOpc1(insn, NoGPR)));
-    ++OpIdx;
-
-    if (!RtOut) {
-      MI.addOperand(NoGPR ? MCOperand::CreateImm(decodeRd(insn))
-                          : MCOperand::CreateReg(
-                                getRegisterEnum(B, ARM::GPRRegClassID,
-                                                decodeRd(insn))));
-      ++OpIdx;
-    }
-
-    MI.addOperand(OneCopOpc ? MCOperand::CreateReg(
-                                getRegisterEnum(B, ARM::GPRRegClassID,
-                                                decodeRn(insn)))
-                            : MCOperand::CreateImm(decodeRn(insn)));
-
-    MI.addOperand(MCOperand::CreateImm(decodeRm(insn)));
-
-    OpIdx += 2;
-
-    if (!OneCopOpc) {
-      MI.addOperand(MCOperand::CreateImm(GetCopOpc2(insn)));
-      ++OpIdx;
-    }
-  }
-
-  return true;
-}
-
-// Branch Instructions.
-// BL: SignExtend(Imm24:'00', 32)
-// Bcc, BL_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1
-// SMC: ZeroExtend(imm4, 32)
-// SVC: ZeroExtend(Imm24, 32)
-//
-// Various coprocessor instructions are assigned BrFrm arbitrarily.
-// Delegates to DisassembleCoprocessor() helper function.
-//
-// MRS/MRSsys: Rd
-// MSR/MSRsys: Rm mask=Inst{19-16}
-// BXJ:        Rm
-// MSRi/MSRsysi: so_imm
-// SRSW/SRS: ldstm_mode:$amode mode_imm
-// RFEW/RFE: ldstm_mode:$amode Rn
-static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  if (CoprocessorOpcode(Opcode))
-    return DisassembleCoprocessor(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  // MRS and MRSsys take one GPR reg Rd.
-  if (Opcode == ARM::MRS || Opcode == ARM::MRSsys) {
-    assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    NumOpsAdded = 1;
-    return true;
-  }
-  // BXJ takes one GPR reg Rm.
-  if (Opcode == ARM::BXJ) {
-    assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    NumOpsAdded = 1;
-    return true;
-  }
-  // MSR take a mask, followed by one GPR reg Rm. The mask contains the R Bit in
-  // bit 4, and the special register fields in bits 3-0.
-  if (Opcode == ARM::MSR) {
-    assert(NumOps >= 1 && OpInfo[1].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
-                                       slice(insn, 19, 16) /* Special Reg */ ));
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    NumOpsAdded = 2;
-    return true;
-  }
-  // MSRi take a mask, followed by one so_imm operand. The mask contains the
-  // R Bit in bit 4, and the special register fields in bits 3-0.
-  if (Opcode == ARM::MSRi) {
-    // A5.2.11 MSR (immediate), and hints & B6.1.6 MSR (immediate)
-    // The hints instructions have more specific encodings, so if mask == 0,
-    // we should reject this as an invalid instruction.
-    if (slice(insn, 19, 16) == 0)
-      return false;
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
-                                       slice(insn, 19, 16) /* Special Reg */ ));
-    // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0.
-    // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}.
-    // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot().
-    unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF;
-    unsigned Imm = insn & 0xFF;
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot)));
-    NumOpsAdded = 2;
-    return true;
-  }
-  if (Opcode == ARM::SRSW || Opcode == ARM::SRS ||
-      Opcode == ARM::RFEW || Opcode == ARM::RFE) {
-    ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn));
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode)));
-
-    if (Opcode == ARM::SRSW || Opcode == ARM::SRS)
-      MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
-    else
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         decodeRn(insn))));
-    NumOpsAdded = 3;
-    return true;
-  }
-
-  assert((Opcode == ARM::Bcc || Opcode == ARM::BL || Opcode == ARM::BL_pred
-          || Opcode == ARM::SMC || Opcode == ARM::SVC) &&
-         "Unexpected Opcode");
-
-  assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected");
-
-  int Imm32 = 0;
-  if (Opcode == ARM::SMC) {
-    // ZeroExtend(imm4, 32) where imm24 = Inst{3-0}.
-    Imm32 = slice(insn, 3, 0);
-  } else if (Opcode == ARM::SVC) {
-    // ZeroExtend(imm24, 32) where imm24 = Inst{23-0}.
-    Imm32 = slice(insn, 23, 0);
-  } else {
-    // SignExtend(imm24:'00', 32) where imm24 = Inst{23-0}.
-    unsigned Imm26 = slice(insn, 23, 0) << 2;
-    //Imm32 = signextend<signed int, 26>(Imm26);
-    Imm32 = SignExtend32<26>(Imm26);
-  }
-
-  MI.addOperand(MCOperand::CreateImm(Imm32));
-  NumOpsAdded = 1;
-
-  return true;
-}
-
-// Misc. Branch Instructions.
-// BX_RET, MOVPCLR
-// BLX, BLX_pred, BX, BX_pred
-// BLXi
-static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // BX_RET and MOVPCLR have only two predicate operands; do an early return.
-  if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR)
-    return true;
-
-  // BLX and BX take one GPR reg.
-  if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred ||
-      Opcode == ARM::BX || Opcode == ARM::BX_pred) {
-    assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    OpIdx = 1;
-    return true;
-  }
-
-  // BLXi takes imm32 (the PC offset).
-  if (Opcode == ARM::BLXi) {
-    assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected");
-    // SignExtend(imm24:H:'0', 32) where imm24 = Inst{23-0} and H = Inst{24}.
-    unsigned Imm26 = slice(insn, 23, 0) << 2 | slice(insn, 24, 24) << 1;
-    int Imm32 = SignExtend32<26>(Imm26);
-    MI.addOperand(MCOperand::CreateImm(Imm32));
-    OpIdx = 1;
-    return true;
-  }
-
-  return false;
-}
-
-static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) {
-  uint32_t lsb = slice(insn, 11, 7);
-  uint32_t msb = slice(insn, 20, 16);
-  uint32_t Val = 0;
-  if (msb < lsb) {
-    DEBUG(errs() << "Encoding error: msb < lsb\n");
-    return false;
-  }
-
-  for (uint32_t i = lsb; i <= msb; ++i)
-    Val |= (1 << i);
-  mask = ~Val;
-  return true;
-}
-
-// Standard data-processing instructions allow PC as a register specifier,
-// but we should reject other DPFrm instructions with PC as registers.
-static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) {
-  switch (Opcode) {
-  default:
-    // Did we miss an opcode?
-    if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) {
-      DEBUG(errs() << "DPFrm with bad reg specifier(s)\n");
-      return true;
-    }
-  case ARM::ADCrr:  case ARM::ADDSrr: case ARM::ADDrr:  case ARM::ANDrr:
-  case ARM::BICrr:  case ARM::CMNzrr: case ARM::CMPrr:  case ARM::EORrr:
-  case ARM::ORRrr:  case ARM::RSBrr:  case ARM::RSCrr:  case ARM::SBCrr:
-  case ARM::SUBSrr: case ARM::SUBrr:  case ARM::TEQrr:  case ARM::TSTrr:
-    return false;
-  }
-}
-
-// A major complication is the fact that some of the saturating add/subtract
-// operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm.
-// They are QADD, QDADD, QDSUB, and QSUB.
-static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  unsigned short NumDefs = MCID.getNumDefs();
-  bool isUnary = isUnaryDP(MCID.TSFlags);
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // Disassemble register def if there is one.
-  if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    ++OpIdx;
-  }
-
-  // Now disassemble the src operands.
-  if (OpIdx >= NumOps)
-    return false;
-
-  // Special-case handling of BFC/BFI/SBFX/UBFX.
-  if (Opcode == ARM::BFC || Opcode == ARM::BFI) {
-    // A8.6.17 BFC & A8.6.18 BFI
-    // Sanity check Rd.
-    if (decodeRd(insn) == 15)
-      return false;
-    MI.addOperand(MCOperand::CreateReg(0));
-    if (Opcode == ARM::BFI) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         decodeRm(insn))));
-      ++OpIdx;
-    }
-    uint32_t mask = 0;
-    if (!getBFCInvMask(insn, mask))
-      return false;
-
-    MI.addOperand(MCOperand::CreateImm(mask));
-    OpIdx += 2;
-    return true;
-  }
-  if (Opcode == ARM::SBFX || Opcode == ARM::UBFX) {
-    // Sanity check Rd and Rm.
-    if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
-      return false;
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 7)));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 16) + 1));
-    OpIdx += 3;
-    return true;
-  }
-
-  bool RmRn = (Opcode == ARM::QADD || Opcode == ARM::QDADD ||
-               Opcode == ARM::QDSUB || Opcode == ARM::QSUB);
-
-  // BinaryDP has an Rn operand.
-  if (!isUnary) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, ARM::GPRRegClassID,
-                                    RmRn ? decodeRm(insn) : decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // If this is a two-address operand, skip it, e.g., MOVCCr operand 1.
-  if (isUnary && (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)) {
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  // Now disassemble operand 2.
-  if (OpIdx >= NumOps)
-    return false;
-
-  if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
-    // We have a reg/reg form.
-    // Assert disabled because saturating operations, e.g., A8.6.127 QASX, are
-    // routed here as well.
-    // assert(getIBit(insn) == 0 && "I_Bit != '0' reg/reg form");
-    if (BadRegsDPFrm(Opcode, insn))
-      return false;
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, ARM::GPRRegClassID,
-                                    RmRn? decodeRn(insn) : decodeRm(insn))));
-    ++OpIdx;
-  } else if (Opcode == ARM::MOVi16 || Opcode == ARM::MOVTi16) {
-    // These two instructions don't allow d as 15.
-    if (decodeRd(insn) == 15)
-      return false;
-    // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}).
-    assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form");
-    unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0);
-    if (!B->tryAddingSymbolicOperand(Imm16, 4, MI))
-      MI.addOperand(MCOperand::CreateImm(Imm16));
-    ++OpIdx;
-  } else {
-    // We have a reg/imm form.
-    // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0.
-    // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}.
-    // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot().
-    assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form");
-    unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF;
-    unsigned Imm = insn & 0xFF;
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot)));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  unsigned short NumDefs = MCID.getNumDefs();
-  bool isUnary = isUnaryDP(MCID.TSFlags);
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // Disassemble register def if there is one.
-  if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn))));
-    ++OpIdx;
-  }
-
-  // Disassemble the src operands.
-  if (OpIdx >= NumOps)
-    return false;
-
-  // BinaryDP has an Rn operand.
-  if (!isUnary) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // If this is a two-address operand, skip it, e.g., MOVCCs operand 1.
-  if (isUnary && (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)) {
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  // Disassemble operand 2, which consists of three components.
-  if (OpIdx + 2 >= NumOps)
-    return false;
-
-  assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) &&
-         (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) &&
-         (OpInfo[OpIdx+2].RegClass < 0) &&
-         "Expect 3 reg operands");
-
-  // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1.
-  unsigned Rs = slice(insn, 4, 4);
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-  if (Rs) {
-    // If Inst{7} != 0, we should reject this insn as an invalid encoding.
-    if (slice(insn, 7, 7))
-      return false;
-
-    // A8.6.3 ADC (register-shifted register)
-    // if d == 15 || n == 15 || m == 15 || s == 15 then UNPREDICTABLE;
-    // 
-    // This also accounts for shift instructions (register) where, fortunately,
-    // Inst{19-16} = 0b0000.
-    // A8.6.89 LSL (register)
-    // if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
-    if (decodeRd(insn) == 15 || decodeRn(insn) == 15 ||
-        decodeRm(insn) == 15 || decodeRs(insn) == 15)
-      return false;
-    
-    // Register-controlled shifts: [Rm, Rs, shift].
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRs(insn))));
-    // Inst{6-5} encodes the shift opcode.
-    ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, 0)));
-  } else {
-    // Constant shifts: [Rm, reg0, shift_imm].
-    MI.addOperand(MCOperand::CreateReg(0)); // NoRegister
-    // Inst{6-5} encodes the shift opcode.
-    ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
-    // Inst{11-7} encodes the imm5 shift amount.
-    unsigned ShImm = slice(insn, 11, 7);
-
-    // A8.4.1.  Possible rrx or shift amount of 32...
-    getImmShiftSE(ShOp, ShImm);
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShImm)));
-  }
-  OpIdx += 3;
-
-  return true;
-}
-
-static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack,
-                           bool Imm) {
-  const StringRef Name = ARMInsts[Opcode].Name;
-  unsigned Rt = decodeRd(insn);
-  unsigned Rn = decodeRn(insn);
-  unsigned Rm = decodeRm(insn);
-  unsigned P  = getPBit(insn);
-  unsigned W  = getWBit(insn);
-
-  if (Store) {
-    // Only STR (immediate, register) allows PC as the source.
-    if (Name.startswith("STRB") && Rt == 15) {
-      DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
-      return true;
-    }
-    if (WBack && (Rn == 15 || Rn == Rt)) {
-      DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
-      return true;
-    }
-    if (!Imm && Rm == 15) {
-      DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
-      return true;
-    }
-  } else {
-    // Only LDR (immediate, register) allows PC as the destination.
-    if (Name.startswith("LDRB") && Rt == 15) {
-      DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
-      return true;
-    }
-    if (Imm) {
-      // Immediate
-      if (Rn == 15) {
-        // The literal form must be in offset mode; it's an encoding error
-        // otherwise.
-        if (!(P == 1 && W == 0)) {
-          DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n");
-          return true;
-        }
-        // LDRB (literal) does not allow PC as the destination.
-        if (Opcode != ARM::LDRi12 && Rt == 15) {
-          DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
-          return true;
-        }
-      } else {
-        // Write back while Rn == Rt does not make sense.
-        if (WBack && (Rn == Rt)) {
-          DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
-          return true;
-        }
-      }
-    } else {
-      // Register
-      if (Rm == 15) {
-        DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
-        return true;
-      }
-      if (WBack && (Rn == 15 || Rn == Rt)) {
-        DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  bool isPrePost = isPrePostLdSt(MCID.TSFlags);
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(((!isStore && MCID.getNumDefs() > 0) ||
-          (isStore && (MCID.getNumDefs() == 0 || isPrePost)))
-         && "Invalid arguments");
-
-  // Operand 0 of a pre- and post-indexed store is the address base writeback.
-  if (isPrePost && isStore) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // Disassemble the dst/src operand.
-  if (OpIdx >= NumOps)
-    return false;
-
-  assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-         "Reg operand expected");
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  ++OpIdx;
-
-  // After dst of a pre- and post-indexed load is the address base writeback.
-  if (isPrePost && !isStore) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // Disassemble the base operand.
-  if (OpIdx >= NumOps)
-    return false;
-
-  assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-         "Reg operand expected");
-  assert((!isPrePost || (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1))
-         && "Index mode or tied_to operand expected");
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  ++OpIdx;
-
-  // For reg/reg form, base reg is followed by +/- reg shop imm.
-  // For immediate form, it is followed by +/- imm12.
-  // See also ARMAddressingModes.h (Addressing Mode #2).
-  if (OpIdx + 1 >= NumOps)
-    return false;
-
-  if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0))
-    return false;
-
-  ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-  unsigned IndexMode =
-               (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
-  if (getIBit(insn) == 0) {
-    // For pre- and post-indexed case, add a reg0 operand (Addressing Mode #2).
-    // Otherwise, skip the reg operand since for addrmode_imm12, Rn has already
-    // been populated.
-    if (isPrePost) {
-      MI.addOperand(MCOperand::CreateReg(0));
-      OpIdx += 1;
-    }
-
-    unsigned Imm12 = slice(insn, 11, 0);
-    if (Opcode == ARM::LDRBi12 || Opcode == ARM::LDRi12 ||
-        Opcode == ARM::STRBi12 || Opcode == ARM::STRi12) {
-      // Disassemble the 12-bit immediate offset, which is the second operand in
-      // $addrmode_imm12 => (ops GPR:$base, i32imm:$offsimm).    
-      int Offset = AddrOpcode == ARM_AM::add ? 1 * Imm12 : -1 * Imm12;
-      MI.addOperand(MCOperand::CreateImm(Offset));
-    } else {
-      // Disassemble the 12-bit immediate offset, which is the second operand in
-      // $am2offset => (ops GPR, i32imm).
-      unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift,
-                                          IndexMode);
-      MI.addOperand(MCOperand::CreateImm(Offset));
-    }
-    OpIdx += 1;
-  } else {
-    // If Inst{25} = 1 and Inst{4} != 0, we should reject this as invalid.
-    if (slice(insn,4,4) == 1)
-      return false;
-
-    // Disassemble the offset reg (Rm), shift type, and immediate shift length.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    // Inst{6-5} encodes the shift opcode.
-    ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
-    // Inst{11-7} encodes the imm5 shift amount.
-    unsigned ShImm = slice(insn, 11, 7);
-
-    // A8.4.1.  Possible rrx or shift amount of 32...
-    getImmShiftSE(ShOp, ShImm);
-    MI.addOperand(MCOperand::CreateImm(
-                    ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp, IndexMode)));
-    OpIdx += 2;
-  }
-
-  return true;
-}
-
-static bool DisassembleLdFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false, B);
-}
-
-static bool DisassembleStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B);
-}
-
-static bool HasDualReg(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST:
-  case ARM::STRD: case ARM::STRD_PRE: case ARM::STRD_POST:
-    return true;
-  }
-}
-
-static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  bool isPrePost = isPrePostLdSt(MCID.TSFlags);
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(((!isStore && MCID.getNumDefs() > 0) ||
-          (isStore && (MCID.getNumDefs() == 0 || isPrePost)))
-         && "Invalid arguments");
-
-  // Operand 0 of a pre- and post-indexed store is the address base writeback.
-  if (isPrePost && isStore) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // Disassemble the dst/src operand.
-  if (OpIdx >= NumOps)
-    return false;
-
-  assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-         "Reg operand expected");
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  ++OpIdx;
-
-  // Fill in LDRD and STRD's second operand Rt operand.
-  if (HasDualReg(Opcode)) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn) + 1)));
-    ++OpIdx;
-  }
-
-  // After dst of a pre- and post-indexed load is the address base writeback.
-  if (isPrePost && !isStore) {
-    assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  // Disassemble the base operand.
-  if (OpIdx >= NumOps)
-    return false;
-
-  assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-         "Reg operand expected");
-  assert((!isPrePost || (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1))
-         && "Offset mode or tied_to operand expected");
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  ++OpIdx;
-
-  // For reg/reg form, base reg is followed by +/- reg.
-  // For immediate form, it is followed by +/- imm8.
-  // See also ARMAddressingModes.h (Addressing Mode #3).
-  if (OpIdx + 1 >= NumOps)
-    return false;
-
-  assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) &&
-         (OpInfo[OpIdx+1].RegClass < 0) &&
-         "Expect 1 reg operand followed by 1 imm operand");
-
-  ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-  unsigned IndexMode =
-                 (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
-  if (getAM3IBit(insn) == 1) {
-    MI.addOperand(MCOperand::CreateReg(0));
-
-    // Disassemble the 8-bit immediate offset.
-    unsigned Imm4H = (insn >> ARMII::ImmHiShift) & 0xF;
-    unsigned Imm4L = insn & 0xF;
-    unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L,
-                                        IndexMode);
-    MI.addOperand(MCOperand::CreateImm(Offset));
-  } else {
-    // Disassemble the offset reg (Rm).
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0, IndexMode);
-    MI.addOperand(MCOperand::CreateImm(Offset));
-  }
-  OpIdx += 2;
-
-  return true;
-}
-
-static bool DisassembleLdMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false,
-                                B);
-}
-
-static bool DisassembleStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B);
-}
-
-// The algorithm for disassembly of LdStMulFrm is different from others because
-// it explicitly populates the two predicate operands after the base register.
-// After that, we need to populate the reglist with each affected register
-// encoded as an MCOperand.
-static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 4 && "LdStMulFrm expects NumOps >= 4");
-  NumOpsAdded = 0;
-
-  unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
-  // Writeback to base, if necessary.
-  if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::STMIA_UPD ||
-      Opcode == ARM::LDMDA_UPD || Opcode == ARM::STMDA_UPD ||
-      Opcode == ARM::LDMDB_UPD || Opcode == ARM::STMDB_UPD ||
-      Opcode == ARM::LDMIB_UPD || Opcode == ARM::STMIB_UPD) {
-    MI.addOperand(MCOperand::CreateReg(Base));
-    ++NumOpsAdded;
-  }
-
-  // Add the base register operand.
-  MI.addOperand(MCOperand::CreateReg(Base));
-
-  // Handling the two predicate operands before the reglist.
-  int64_t CondVal = getCondField(insn);
-  if (CondVal == 0xF)
-    return false;
-  MI.addOperand(MCOperand::CreateImm(CondVal));
-  MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-
-  NumOpsAdded += 3;
-
-  // Fill the variadic part of reglist.
-  unsigned RegListBits = insn & ((1 << 16) - 1);
-  for (unsigned i = 0; i < 16; ++i) {
-    if ((RegListBits >> i) & 1) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         i)));
-      ++NumOpsAdded;
-    }
-  }
-
-  return true;
-}
-
-// LDREX, LDREXB, LDREXH: Rd Rn
-// LDREXD:                Rd Rd+1 Rn
-// STREX, STREXB, STREXH: Rd Rm Rn
-// STREXD:                Rd Rm Rm+1 Rn
-//
-// SWP, SWPB:             Rd Rm Rn
-static bool DisassembleLdStExFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2
-         && OpInfo[0].RegClass == ARM::GPRRegClassID
-         && OpInfo[1].RegClass == ARM::GPRRegClassID
-         && "Expect 2 reg operands");
-
-  bool isStore = slice(insn, 20, 20) == 0;
-  bool isDW = (Opcode == ARM::LDREXD || Opcode == ARM::STREXD);
-
-  // Add the destination operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  ++OpIdx;
-
-  // Store register Exclusive needs a source operand.
-  if (isStore) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    ++OpIdx;
-
-    if (isDW) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         decodeRm(insn)+1)));
-      ++OpIdx;
-    }
-  } else if (isDW) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRd(insn)+1)));
-    ++OpIdx;
-  }
-
-  // Finally add the pointer operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  ++OpIdx;
-
-  return true;
-}
-
-// Misc. Arithmetic Instructions.
-// CLZ: Rd Rm
-// PKHBT, PKHTB: Rd Rn Rm , LSL/ASR #imm5
-// RBIT, REV, REV16, REVSH: Rd Rm
-static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2
-         && OpInfo[0].RegClass == ARM::GPRRegClassID
-         && OpInfo[1].RegClass == ARM::GPRRegClassID
-         && "Expect 2 reg operands");
-
-  bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID;
-
-  // Sanity check the registers, which should not be 15.
-  if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
-    return false;
-  if (ThreeReg && decodeRn(insn) == 15)
-    return false;
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  ++OpIdx;
-
-  if (ThreeReg) {
-    assert(NumOps >= 4 && "Expect >= 4 operands");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-  ++OpIdx;
-
-  // If there is still an operand info left which is an immediate operand, add
-  // an additional imm5 LSL/ASR operand.
-  if (ThreeReg && OpInfo[OpIdx].RegClass < 0
-      && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // Extract the 5-bit immediate field Inst{11-7}.
-    unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F;
-    ARM_AM::ShiftOpc Opc = ARM_AM::no_shift;
-    if (Opcode == ARM::PKHBT)
-      Opc = ARM_AM::lsl;
-    else if (Opcode == ARM::PKHTB)
-      Opc = ARM_AM::asr;
-    getImmShiftSE(Opc, ShiftAmt);
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShiftAmt)));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-/// DisassembleSatFrm - Disassemble saturate instructions:
-/// SSAT, SSAT16, USAT, and USAT16.
-static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  // A8.6.183 SSAT
-  // if d == 15 || n == 15 then UNPREDICTABLE;
-  if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
-    return false;
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  NumOpsAdded = MCID.getNumOperands() - 2; // ignore predicate operands
-
-  // Disassemble register def.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  unsigned Pos = slice(insn, 20, 16);
-  if (Opcode == ARM::SSAT || Opcode == ARM::SSAT16)
-    Pos += 1;
-  MI.addOperand(MCOperand::CreateImm(Pos));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-
-  if (NumOpsAdded == 4) {
-    ARM_AM::ShiftOpc Opc = (slice(insn, 6, 6) != 0 ? ARM_AM::asr : ARM_AM::lsl);
-    // Inst{11-7} encodes the imm5 shift amount.
-    unsigned ShAmt = slice(insn, 11, 7);
-    if (ShAmt == 0) {
-      // A8.6.183.  Possible ASR shift amount of 32...
-      if (Opc == ARM_AM::asr)
-        ShAmt = 32;
-      else
-        Opc = ARM_AM::no_shift;
-    }
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt)));
-  }
-  return true;
-}
-
-// Extend instructions.
-// SXT* and UXT*: Rd [Rn] Rm [rot_imm].
-// The 2nd operand register is Rn and the 3rd operand regsiter is Rm for the
-// three register operand form.  Otherwise, Rn=0b1111 and only Rm is used.
-static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  // A8.6.220 SXTAB
-  // if d == 15 || m == 15 then UNPREDICTABLE;
-  if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
-    return false;
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2
-         && OpInfo[0].RegClass == ARM::GPRRegClassID
-         && OpInfo[1].RegClass == ARM::GPRRegClassID
-         && "Expect 2 reg operands");
-
-  bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID;
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  ++OpIdx;
-
-  if (ThreeReg) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-  ++OpIdx;
-
-  // If there is still an operand info left which is an immediate operand, add
-  // an additional rotate immediate operand.
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-      && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // Extract the 2-bit rotate field Inst{11-10}.
-    unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3;
-    // Rotation by 8, 16, or 24 bits.
-    MI.addOperand(MCOperand::CreateImm(rot << 3));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-/////////////////////////////////////
-//                                 //
-//    Utility Functions For VFP    //
-//                                 //
-/////////////////////////////////////
-
-// Extract/Decode Dd/Sd:
-//
-// SP => d = UInt(Vd:D)
-// DP => d = UInt(D:Vd)
-static unsigned decodeVFPRd(uint32_t insn, bool isSPVFP) {
-  return isSPVFP ? (decodeRd(insn) << 1 | getDBit(insn))
-                 : (decodeRd(insn) | getDBit(insn) << 4);
-}
-
-// Extract/Decode Dn/Sn:
-//
-// SP => n = UInt(Vn:N)
-// DP => n = UInt(N:Vn)
-static unsigned decodeVFPRn(uint32_t insn, bool isSPVFP) {
-  return isSPVFP ? (decodeRn(insn) << 1 | getNBit(insn))
-                 : (decodeRn(insn) | getNBit(insn) << 4);
-}
-
-// Extract/Decode Dm/Sm:
-//
-// SP => m = UInt(Vm:M)
-// DP => m = UInt(M:Vm)
-static unsigned decodeVFPRm(uint32_t insn, bool isSPVFP) {
-  return isSPVFP ? (decodeRm(insn) << 1 | getMBit(insn))
-                 : (decodeRm(insn) | getMBit(insn) << 4);
-}
-
-// A7.5.1
-static APInt VFPExpandImm(unsigned char byte, unsigned N) {
-  assert(N == 32 || N == 64);
-
-  uint64_t Result;
-  unsigned bit6 = slice(byte, 6, 6);
-  if (N == 32) {
-    Result = slice(byte, 7, 7) << 31 | slice(byte, 5, 0) << 19;
-    if (bit6)
-      Result |= 0x1f << 25;
-    else
-      Result |= 0x1 << 30;
-  } else {
-    Result = (uint64_t)slice(byte, 7, 7) << 63 |
-             (uint64_t)slice(byte, 5, 0) << 48;
-    if (bit6)
-      Result |= 0xffULL << 54;
-    else
-      Result |= 0x1ULL << 62;
-  }
-  return APInt(N, Result);
-}
-
-// VFP Unary Format Instructions:
-//
-// VCMP[E]ZD, VCMP[E]ZS: compares one floating-point register with zero
-// VCVTDS, VCVTSD: converts between double-precision and single-precision
-// The rest of the instructions have homogeneous [VFP]Rd and [VFP]Rm registers.
-static bool DisassembleVFPUnaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 1 && "VFPUnaryFrm expects NumOps >= 1");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned RegClass = OpInfo[OpIdx].RegClass;
-  assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
-         "Reg operand expected");
-  bool isSP = (RegClass == ARM::SPRRegClassID);
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP))));
-  ++OpIdx;
-
-  // Early return for compare with zero instructions.
-  if (Opcode == ARM::VCMPEZD || Opcode == ARM::VCMPEZS
-      || Opcode == ARM::VCMPZD || Opcode == ARM::VCMPZS)
-    return true;
-
-  RegClass = OpInfo[OpIdx].RegClass;
-  assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
-         "Reg operand expected");
-  isSP = (RegClass == ARM::SPRRegClassID);
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP))));
-  ++OpIdx;
-
-  return true;
-}
-
-// All the instructions have homogeneous [VFP]Rd, [VFP]Rn, and [VFP]Rm regs.
-// Some of them have operand constraints which tie the first operand in the
-// InOperandList to that of the dst.  As far as asm printing is concerned, this
-// tied_to operand is simply skipped.
-static bool DisassembleVFPBinaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 3 && "VFPBinaryFrm expects NumOps >= 3");
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned RegClass = OpInfo[OpIdx].RegClass;
-  assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
-         "Reg operand expected");
-  bool isSP = (RegClass == ARM::SPRRegClassID);
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP))));
-  ++OpIdx;
-
-  // Skip tied_to operand constraint.
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
-    assert(NumOps >= 4 && "Expect >=4 operands");
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass, decodeVFPRn(insn, isSP))));
-  ++OpIdx;
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP))));
-  ++OpIdx;
-
-  return true;
-}
-
-// A8.6.295 vcvt (floating-point <-> integer)
-// Int to FP: VSITOD, VSITOS, VUITOD, VUITOS
-// FP to Int: VTOSI[Z|R]D, VTOSI[Z|R]S, VTOUI[Z|R]D, VTOUI[Z|R]S
-//
-// A8.6.297 vcvt (floating-point and fixed-point)
-// Dd|Sd Dd|Sd(TIED_TO) #fbits(= 16|32 - UInt(imm4:i))
-static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 2 && "VFPConv1Frm expects NumOps >= 2");
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  bool SP = slice(insn, 8, 8) == 0; // A8.6.295 & A8.6.297
-  bool fixed_point = slice(insn, 17, 17) == 1; // A8.6.297
-  unsigned RegClassID = SP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
-  if (fixed_point) {
-    // A8.6.297
-    assert(NumOps >= 3 && "Expect >= 3 operands");
-    int size = slice(insn, 7, 7) == 0 ? 16 : 32;
-    int fbits = size - (slice(insn,3,0) << 1 | slice(insn,5,5));
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, RegClassID,
-                                    decodeVFPRd(insn, SP))));
-
-    assert(MCID.getOperandConstraint(1, MCOI::TIED_TO) != -1 &&
-           "Tied to operand expected");
-    MI.addOperand(MI.getOperand(0));
-
-    assert(OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() &&
-           !OpInfo[2].isOptionalDef() && "Imm operand expected");
-    MI.addOperand(MCOperand::CreateImm(fbits));
-
-    NumOpsAdded = 3;
-  } else {
-    // A8.6.295
-    // The Rd (destination) and Rm (source) bits have different interpretations
-    // depending on their single-precisonness.
-    unsigned d, m;
-    if (slice(insn, 18, 18) == 1) { // to_integer operation
-      d = decodeVFPRd(insn, true /* Is Single Precision */);
-      MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, ARM::SPRRegClassID, d)));
-      m = decodeVFPRm(insn, SP);
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, m)));
-    } else {
-      d = decodeVFPRd(insn, SP);
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, d)));
-      m = decodeVFPRm(insn, true /* Is Single Precision */);
-      MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, ARM::SPRRegClassID, m)));
-    }
-    NumOpsAdded = 2;
-  }
-
-  return true;
-}
-
-// VMOVRS - A8.6.330
-// Rt => Rd; Sn => UInt(Vn:N)
-static bool DisassembleVFPConv2Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 2 && "VFPConv2Frm expects NumOps >= 2");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                     decodeVFPRn(insn, true))));
-  NumOpsAdded = 2;
-  return true;
-}
-
-// VMOVRRD - A8.6.332
-// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm)
-//
-// VMOVRRS - A8.6.331
-// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1
-static bool DisassembleVFPConv3Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 3 && "VFPConv3Frm expects NumOps >= 3");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  OpIdx = 2;
-
-  if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) {
-    unsigned Sm = decodeVFPRm(insn, true);
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                       Sm)));
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                       Sm+1)));
-    OpIdx += 2;
-  } else {
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, ARM::DPRRegClassID,
-                                    decodeVFPRm(insn, false))));
-    ++OpIdx;
-  }
-  return true;
-}
-
-// VMOVSR - A8.6.330
-// Rt => Rd; Sn => UInt(Vn:N)
-static bool DisassembleVFPConv4Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 2 && "VFPConv4Frm expects NumOps >= 2");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                     decodeVFPRn(insn, true))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  NumOpsAdded = 2;
-  return true;
-}
-
-// VMOVDRR - A8.6.332
-// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm)
-//
-// VMOVRRS - A8.6.331
-// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1
-static bool DisassembleVFPConv5Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 3 && "VFPConv5Frm expects NumOps >= 3");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) {
-    unsigned Sm = decodeVFPRm(insn, true);
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                       Sm)));
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
-                                                       Sm+1)));
-    OpIdx += 2;
-  } else {
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, ARM::DPRRegClassID,
-                                    decodeVFPRm(insn, false))));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  OpIdx += 2;
-  return true;
-}
-
-// VFP Load/Store Instructions.
-// VLDRD, VLDRS, VSTRD, VSTRS
-static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 3 && "VFPLdStFrm expects NumOps >= 3");
-
-  bool isSPVFP = (Opcode == ARM::VLDRS || Opcode == ARM::VSTRS);
-  unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
-  // Extract Dd/Sd for operand 0.
-  unsigned RegD = decodeVFPRd(insn, isSPVFP);
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, RegD)));
-
-  unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-  MI.addOperand(MCOperand::CreateReg(Base));
-
-  // Next comes the AM5 Opcode.
-  ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-  unsigned char Imm8 = insn & 0xFF;
-  MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(AddrOpcode, Imm8)));
-
-  NumOpsAdded = 3;
-
-  return true;
-}
-
-// VFP Load/Store Multiple Instructions.
-// We have an optional write back reg, the base, and two predicate operands.
-// It is then followed by a reglist of either DPR(s) or SPR(s).
-//
-// VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD]
-static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 4 && "VFPLdStMulFrm expects NumOps >= 4");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
-  // Writeback to base, if necessary.
-  if (Opcode == ARM::VLDMDIA_UPD || Opcode == ARM::VLDMSIA_UPD ||
-      Opcode == ARM::VLDMDDB_UPD || Opcode == ARM::VLDMSDB_UPD ||
-      Opcode == ARM::VSTMDIA_UPD || Opcode == ARM::VSTMSIA_UPD ||
-      Opcode == ARM::VSTMDDB_UPD || Opcode == ARM::VSTMSDB_UPD) {
-    MI.addOperand(MCOperand::CreateReg(Base));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(Base));
-
-  // Handling the two predicate operands before the reglist.
-  int64_t CondVal = getCondField(insn);
-  if (CondVal == 0xF)
-    return false;
-  MI.addOperand(MCOperand::CreateImm(CondVal));
-  MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-
-  OpIdx += 3;
-
-  bool isSPVFP = (Opcode == ARM::VLDMSIA     ||
-                  Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD ||
-                  Opcode == ARM::VSTMSIA     ||
-                  Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD);
-  unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
-  // Extract Dd/Sd.
-  unsigned RegD = decodeVFPRd(insn, isSPVFP);
-
-  // Fill the variadic part of reglist.
-  unsigned char Imm8 = insn & 0xFF;
-  unsigned Regs = isSPVFP ? Imm8 : Imm8/2;
-
-  // Apply some sanity checks before proceeding.
-  if (Regs == 0 || (RegD + Regs) > 32 || (!isSPVFP && Regs > 16))
-    return false;
-
-  for (unsigned i = 0; i < Regs; ++i) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID,
-                                                       RegD + i)));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// Misc. VFP Instructions.
-// FMSTAT (vmrs with Rt=0b1111, i.e., to apsr_nzcv and no register operand)
-// FCONSTD (DPR and a VFPf64Imm operand)
-// FCONSTS (SPR and a VFPf32Imm operand)
-// VMRS/VMSR (GPR operand)
-static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  if (Opcode == ARM::FMSTAT)
-    return true;
-
-  assert(NumOps >= 2 && "VFPMiscFrm expects >=2 operands");
-
-  unsigned RegEnum = 0;
-  switch (OpInfo[0].RegClass) {
-  case ARM::DPRRegClassID:
-    RegEnum = getRegisterEnum(B, ARM::DPRRegClassID, decodeVFPRd(insn, false));
-    break;
-  case ARM::SPRRegClassID:
-    RegEnum = getRegisterEnum(B, ARM::SPRRegClassID, decodeVFPRd(insn, true));
-    break;
-  case ARM::GPRRegClassID:
-    RegEnum = getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn));
-    break;
-  default:
-    assert(0 && "Invalid reg class id");
-    return false;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(RegEnum));
-  ++OpIdx;
-
-  // Extract/decode the f64/f32 immediate.
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-        && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // The asm syntax specifies the floating point value, not the 8-bit literal.
-    APInt immRaw = VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0),
-                             Opcode == ARM::FCONSTD ? 64 : 32);
-    APFloat immFP = APFloat(immRaw, true);
-    double imm = Opcode == ARM::FCONSTD ? immFP.convertToDouble() :
-      immFP.convertToFloat();
-    MI.addOperand(MCOperand::CreateFPImm(imm));
-
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// DisassembleThumbFrm() is defined in ThumbDisassemblerCore.h file.
-#include "ThumbDisassemblerCore.h"
-
-/////////////////////////////////////////////////////
-//                                                 //
-//     Utility Functions For ARM Advanced SIMD     //
-//                                                 //
-/////////////////////////////////////////////////////
-
-// The following NEON namings are based on A8.6.266 VABA, VABAL.  Notice that
-// A8.6.303 VDUP (ARM core register)'s D/Vd pair is the N/Vn pair of VABA/VABAL.
-
-// A7.3 Register encoding
-
-// Extract/Decode NEON D/Vd:
-//
-// Note that for quadword, Qd = UInt(D:Vd<3:1>) = Inst{22:15-13}, whereas for
-// doubleword, Dd = UInt(D:Vd).  We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// D = Inst{22}, Vd = Inst{15-12}
-static unsigned decodeNEONRd(uint32_t insn) {
-  return ((insn >> ARMII::NEON_D_BitShift) & 1) << 4
-    | ((insn >> ARMII::NEON_RegRdShift) & ARMII::NEONRegMask);
-}
-
-// Extract/Decode NEON N/Vn:
-//
-// Note that for quadword, Qn = UInt(N:Vn<3:1>) = Inst{7:19-17}, whereas for
-// doubleword, Dn = UInt(N:Vn).  We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// N = Inst{7}, Vn = Inst{19-16}
-static unsigned decodeNEONRn(uint32_t insn) {
-  return ((insn >> ARMII::NEON_N_BitShift) & 1) << 4
-    | ((insn >> ARMII::NEON_RegRnShift) & ARMII::NEONRegMask);
-}
-
-// Extract/Decode NEON M/Vm:
-//
-// Note that for quadword, Qm = UInt(M:Vm<3:1>) = Inst{5:3-1}, whereas for
-// doubleword, Dm = UInt(M:Vm).  We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// M = Inst{5}, Vm = Inst{3-0}
-static unsigned decodeNEONRm(uint32_t insn) {
-  return ((insn >> ARMII::NEON_M_BitShift) & 1) << 4
-    | ((insn >> ARMII::NEON_RegRmShift) & ARMII::NEONRegMask);
-}
-
-namespace {
-enum ElemSize {
-  ESizeNA = 0,
-  ESize8 = 8,
-  ESize16 = 16,
-  ESize32 = 32,
-  ESize64 = 64
-};
-} // End of unnamed namespace
-
-// size        field -> Inst{11-10}
-// index_align field -> Inst{7-4}
-//
-// The Lane Index interpretation depends on the Data Size:
-//   8  (encoded as size = 0b00) -> Index = index_align[3:1]
-//   16 (encoded as size = 0b01) -> Index = index_align[3:2]
-//   32 (encoded as size = 0b10) -> Index = index_align[3]
-//
-// Ref: A8.6.317 VLD4 (single 4-element structure to one lane).
-static unsigned decodeLaneIndex(uint32_t insn) {
-  unsigned size = insn >> 10 & 3;
-  assert((size == 0 || size == 1 || size == 2) &&
-         "Encoding error: size should be either 0, 1, or 2");
-
-  unsigned index_align = insn >> 4 & 0xF;
-  return (index_align >> 1) >> size;
-}
-
-// imm64 = AdvSIMDExpandImm(op, cmode, i:imm3:imm4)
-// op = Inst{5}, cmode = Inst{11-8}
-// i = Inst{24} (ARM architecture)
-// imm3 = Inst{18-16}, imm4 = Inst{3-0}
-// Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions.
-static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) {
-  unsigned char op = (insn >> 5) & 1;
-  unsigned char cmode = (insn >> 8) & 0xF;
-  unsigned char Imm8 = ((insn >> 24) & 1) << 7 |
-                       ((insn >> 16) & 7) << 4 |
-                       (insn & 0xF);
-  return (op << 12) | (cmode << 8) | Imm8;
-}
-
-// A8.6.339 VMUL, VMULL (by scalar)
-// ESize16 => m = Inst{2-0} (Vm<2:0>) D0-D7
-// ESize32 => m = Inst{3-0} (Vm<3:0>) D0-D15
-static unsigned decodeRestrictedDm(uint32_t insn, ElemSize esize) {
-  switch (esize) {
-  case ESize16:
-    return insn & 7;
-  case ESize32:
-    return insn & 0xF;
-  default:
-    assert(0 && "Unreachable code!");
-    return 0;
-  }
-}
-
-// A8.6.339 VMUL, VMULL (by scalar)
-// ESize16 => index = Inst{5:3} (M:Vm<3>) D0-D7
-// ESize32 => index = Inst{5}   (M)       D0-D15
-static unsigned decodeRestrictedDmIndex(uint32_t insn, ElemSize esize) {
-  switch (esize) {
-  case ESize16:
-    return (((insn >> 5) & 1) << 1) | ((insn >> 3) & 1);
-  case ESize32:
-    return (insn >> 5) & 1;
-  default:
-    assert(0 && "Unreachable code!");
-    return 0;
-  }
-}
-
-// A8.6.296 VCVT (between floating-point and fixed-point, Advanced SIMD)
-// (64 - <fbits>) is encoded as imm6, i.e., Inst{21-16}.
-static unsigned decodeVCVTFractionBits(uint32_t insn) {
-  return 64 - ((insn >> 16) & 0x3F);
-}
-
-// A8.6.302 VDUP (scalar)
-// ESize8  => index = Inst{19-17}
-// ESize16 => index = Inst{19-18}
-// ESize32 => index = Inst{19}
-static unsigned decodeNVLaneDupIndex(uint32_t insn, ElemSize esize) {
-  switch (esize) {
-  case ESize8:
-    return (insn >> 17) & 7;
-  case ESize16:
-    return (insn >> 18) & 3;
-  case ESize32:
-    return (insn >> 19) & 1;
-  default:
-    assert(0 && "Unspecified element size!");
-    return 0;
-  }
-}
-
-// A8.6.328 VMOV (ARM core register to scalar)
-// A8.6.329 VMOV (scalar to ARM core register)
-// ESize8  => index = Inst{21:6-5}
-// ESize16 => index = Inst{21:6}
-// ESize32 => index = Inst{21}
-static unsigned decodeNVLaneOpIndex(uint32_t insn, ElemSize esize) {
-  switch (esize) {
-  case ESize8:
-    return ((insn >> 21) & 1) << 2 | ((insn >> 5) & 3);
-  case ESize16:
-    return ((insn >> 21) & 1) << 1 | ((insn >> 6) & 1);
-  case ESize32:
-    return ((insn >> 21) & 1);
-  default:
-    assert(0 && "Unspecified element size!");
-    return 0;
-  }
-}
-
-// Imm6 = Inst{21-16}, L = Inst{7}
-//
-// LeftShift == true (A8.6.367 VQSHL, A8.6.387 VSLI):
-// case L:imm6 of
-//   '0001xxx' => esize = 8; shift_amount = imm6 - 8
-//   '001xxxx' => esize = 16; shift_amount = imm6 - 16
-//   '01xxxxx' => esize = 32; shift_amount = imm6 - 32
-//   '1xxxxxx' => esize = 64; shift_amount = imm6
-//
-// LeftShift == false (A8.6.376 VRSHR, A8.6.368 VQSHRN):
-// case L:imm6 of
-//   '0001xxx' => esize = 8; shift_amount = 16 - imm6
-//   '001xxxx' => esize = 16; shift_amount = 32 - imm6
-//   '01xxxxx' => esize = 32; shift_amount = 64 - imm6
-//   '1xxxxxx' => esize = 64; shift_amount = 64 - imm6
-//
-static unsigned decodeNVSAmt(uint32_t insn, bool LeftShift) {
-  ElemSize esize = ESizeNA;
-  unsigned L = (insn >> 7) & 1;
-  unsigned imm6 = (insn >> 16) & 0x3F;
-  if (L == 0) {
-    if (imm6 >> 3 == 1)
-      esize = ESize8;
-    else if (imm6 >> 4 == 1)
-      esize = ESize16;
-    else if (imm6 >> 5 == 1)
-      esize = ESize32;
-    else
-      assert(0 && "Wrong encoding of Inst{7:21-16}!");
-  } else
-    esize = ESize64;
-
-  if (LeftShift)
-    return esize == ESize64 ? imm6 : (imm6 - esize);
-  else
-    return esize == ESize64 ? (esize - imm6) : (2*esize - imm6);
-}
-
-// A8.6.305 VEXT
-// Imm4 = Inst{11-8}
-static unsigned decodeN3VImm(uint32_t insn) {
-  return (insn >> 8) & 0xF;
-}
-
-// VLD*
-//   D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm]
-// VLD*LN*
-//   D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm] TIED_TO ... imm(idx)
-// VST*
-//   Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ...
-// VST*LN*
-//   Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ... [imm(idx)]
-//
-// Correctly set VLD*/VST*'s TIED_TO GPR, as the asm printer needs it.
-static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, bool Store, bool DblSpaced,
-    unsigned alignment, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-
-  // At least one DPR register plus addressing mode #6.
-  assert(NumOps >= 3 && "Expect >= 3 operands");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // We have homogeneous NEON registers for Load/Store.
-  unsigned RegClass = 0;
-
-  // Double-spaced registers have increments of 2.
-  unsigned Inc = DblSpaced ? 2 : 1;
-
-  unsigned Rn = decodeRn(insn);
-  unsigned Rm = decodeRm(insn);
-  unsigned Rd = decodeNEONRd(insn);
-
-  // A7.7.1 Advanced SIMD addressing mode.
-  bool WB = Rm != 15;
-
-  // LLVM Addressing Mode #6.
-  unsigned RmEnum = 0;
-  if (WB && Rm != 13)
-    RmEnum = getRegisterEnum(B, ARM::GPRRegClassID, Rm);
-
-  if (Store) {
-    // Consume possible WB, AddrMode6, possible increment reg, the DPR/QPR's,
-    // then possible lane index.
-    assert(OpIdx < NumOps && OpInfo[0].RegClass == ARM::GPRRegClassID &&
-           "Reg operand expected");
-
-    if (WB) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         Rn)));
-      ++OpIdx;
-    }
-
-    assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected");
-    // addrmode6 := (ops GPR:$addr, i32imm)
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       Rn)));
-    MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment
-    OpIdx += 2;
-
-    if (WB) {
-      MI.addOperand(MCOperand::CreateReg(RmEnum));
-      ++OpIdx;
-    }
-
-    assert(OpIdx < NumOps &&
-           (OpInfo[OpIdx].RegClass == ARM::DPRRegClassID ||
-            OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) &&
-           "Reg operand expected");
-
-    RegClass = OpInfo[OpIdx].RegClass;
-    while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
-      MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, RegClass, Rd)));
-      Rd += Inc;
-      ++OpIdx;
-    }
-
-    // Handle possible lane index.
-    if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-        && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-      MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn)));
-      ++OpIdx;
-    }
-
-  } else {
-    // Consume the DPR/QPR's, possible WB, AddrMode6, possible incrment reg,
-    // possible TIED_TO DPR/QPR's (ignored), then possible lane index.
-    RegClass = OpInfo[0].RegClass;
-
-    while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
-      MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, RegClass, Rd)));
-      Rd += Inc;
-      ++OpIdx;
-    }
-
-    if (WB) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         Rn)));
-      ++OpIdx;
-    }
-
-    assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
-           OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected");
-    // addrmode6 := (ops GPR:$addr, i32imm)
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       Rn)));
-    MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment
-    OpIdx += 2;
-
-    if (WB) {
-      MI.addOperand(MCOperand::CreateReg(RmEnum));
-      ++OpIdx;
-    }
-
-    while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
-      assert(MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1 &&
-             "Tied to operand expected");
-      MI.addOperand(MCOperand::CreateReg(0));
-      ++OpIdx;
-    }
-
-    // Handle possible lane index.
-    if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-        && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-      MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn)));
-      ++OpIdx;
-    }
-  }
-
-  // Accessing registers past the end of the NEON register file is not
-  // defined.
-  if (Rd > 32)
-    return false;
-
-  return true;
-}
-
-// A8.6.308, A8.6.311, A8.6.314, A8.6.317.
-static bool Align4OneLaneInst(unsigned elem, unsigned size,
-    unsigned index_align, unsigned & alignment) {
-  unsigned bits = 0;
-  switch (elem) {
-  default:
-    return false;
-  case 1:
-    // A8.6.308
-    if (size == 0)
-      return slice(index_align, 0, 0) == 0;
-    else if (size == 1) {
-      bits = slice(index_align, 1, 0);
-      if (bits != 0 && bits != 1)
-        return false;
-      if (bits == 1)
-        alignment = 16;
-      return true;
-    } else if (size == 2) {
-      bits = slice(index_align, 2, 0);
-      if (bits != 0 && bits != 3)
-        return false;
-      if (bits == 3)
-        alignment = 32;
-      return true;;
-    }
-    return true;
-  case 2:
-    // A8.6.311
-    if (size == 0) {
-      if (slice(index_align, 0, 0) == 1)
-        alignment = 16;
-      return true;
-    } if (size == 1) {
-      if (slice(index_align, 0, 0) == 1)
-        alignment = 32;
-      return true;
-    } else if (size == 2) {
-      if (slice(index_align, 1, 1) != 0)
-        return false;
-      if (slice(index_align, 0, 0) == 1)
-        alignment = 64;
-      return true;;
-    }
-    return true;
-  case 3:
-    // A8.6.314
-    if (size == 0) {
-      if (slice(index_align, 0, 0) != 0)
-        return false;
-      return true;
-    } if (size == 1) {
-      if (slice(index_align, 0, 0) != 0)
-        return false;
-      return true;
-      return true;
-    } else if (size == 2) {
-      if (slice(index_align, 1, 0) != 0)
-        return false;
-      return true;;
-    }
-    return true;
-  case 4:
-    // A8.6.317
-    if (size == 0) {
-      if (slice(index_align, 0, 0) == 1)
-        alignment = 32;
-      return true;
-    } if (size == 1) {
-      if (slice(index_align, 0, 0) == 1)
-        alignment = 64;
-      return true;
-    } else if (size == 2) {
-      bits = slice(index_align, 1, 0);
-      if (bits == 3)
-        return false;
-      if (bits == 1)
-        alignment = 64;
-      else if (bits == 2)
-        alignment = 128;
-      return true;;
-    }
-    return true;
-  }
-}
-
-// A7.7
-// If L (Inst{21}) == 0, store instructions.
-// Find out about double-spaced-ness of the Opcode and pass it on to
-// DisassembleNLdSt0().
-static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const StringRef Name = ARMInsts[Opcode].Name;
-  bool DblSpaced = false;
-  // 0 represents standard alignment, i.e., unaligned data access.
-  unsigned alignment = 0;
-
-  unsigned elem = 0; // legal values: {1, 2, 3, 4}
-  if (Name.startswith("VST1") || Name.startswith("VLD1"))
-    elem = 1;
-
-  if (Name.startswith("VST2") || Name.startswith("VLD2"))
-    elem = 2;
-
-  if (Name.startswith("VST3") || Name.startswith("VLD3"))
-    elem = 3;
-
-  if (Name.startswith("VST4") || Name.startswith("VLD4"))
-    elem = 4;
-
-  if (Name.find("LN") != std::string::npos) {
-    // To one lane instructions.
-    // See, for example, 8.6.317 VLD4 (single 4-element structure to one lane).
-
-    // Utility function takes number of elements, size, and index_align.
-    if (!Align4OneLaneInst(elem,
-                           slice(insn, 11, 10),
-                           slice(insn, 7, 4),
-                           alignment))
-      return false;
-
-    // <size> == 16 && Inst{5} == 1 --> DblSpaced = true
-    if (Name.endswith("16") || Name.endswith("16_UPD"))
-      DblSpaced = slice(insn, 5, 5) == 1;
-
-    // <size> == 32 && Inst{6} == 1 --> DblSpaced = true
-    if (Name.endswith("32") || Name.endswith("32_UPD"))
-      DblSpaced = slice(insn, 6, 6) == 1;
-  } else if (Name.find("DUP") != std::string::npos) {
-    // Single element (or structure) to all lanes.
-    // Inst{9-8} encodes the number of element(s) in the structure, with:
-    // 0b00 (VLD1DUP) (for this, a bit makes sense only for data size 16 and 32.
-    // 0b01 (VLD2DUP)
-    // 0b10 (VLD3DUP) (for this, a bit must be encoded as 0)
-    // 0b11 (VLD4DUP)
-    //
-    // Inst{7-6} encodes the data size, with:
-    // 0b00 => 8, 0b01 => 16, 0b10 => 32
-    //
-    // Inst{4} (the a bit) encodes the align action (0: standard alignment)
-    unsigned elem = slice(insn, 9, 8) + 1;
-    unsigned a = slice(insn, 4, 4);
-    if (elem != 3) {
-      // 0b11 is not a valid encoding for Inst{7-6}.
-      if (slice(insn, 7, 6) == 3)
-        return false;
-      unsigned data_size = 8 << slice(insn, 7, 6);
-      // For VLD1DUP, a bit makes sense only for data size of 16 and 32.
-      if (a && data_size == 8)
-        return false;
-
-      // Now we can calculate the alignment!
-      if (a)
-        alignment = elem * data_size;
-    } else {
-      if (a) {
-        // A8.6.315 VLD3 (single 3-element structure to all lanes)
-        // The a bit must be encoded as 0.
-        return false;
-      }
-    }
-  } else {
-    // Multiple n-element structures with type encoded as Inst{11-8}.
-    // See, for example, A8.6.316 VLD4 (multiple 4-element structures).
-
-    // Inst{5-4} encodes alignment.
-    unsigned align = slice(insn, 5, 4);
-    switch (align) {
-    default:
-      break;
-    case 1:
-      alignment = 64; break;
-    case 2:
-      alignment = 128; break;
-    case 3:
-      alignment = 256; break;
-    }
-
-    unsigned type = slice(insn, 11, 8);
-    // Reject UNDEFINED instructions based on type and align.
-    // Plus set DblSpaced flag where appropriate.
-    switch (elem) {
-    default:
-      break;
-    case 1:
-      // n == 1
-      // A8.6.307 & A8.6.391
-      if ((type == 7  && slice(align, 1, 1) == 1) ||
-          (type == 10 && align == 3) ||
-          (type == 6  && slice(align, 1, 1) == 1))
-        return false;
-      break;
-    case 2:
-      // n == 2 && type == 0b1001 -> DblSpaced = true
-      // A8.6.310 & A8.6.393
-      if ((type == 8 || type == 9) && align == 3)
-        return false;
-      DblSpaced = (type == 9);
-      break;
-    case 3:
-      // n == 3 && type == 0b0101 -> DblSpaced = true
-      // A8.6.313 & A8.6.395
-      if (slice(insn, 7, 6) == 3 || slice(align, 1, 1) == 1)
-        return false;
-      DblSpaced = (type == 5);
-      break;
-    case 4:
-      // n == 4 && type == 0b0001 -> DblSpaced = true
-      // A8.6.316 & A8.6.397
-      if (slice(insn, 7, 6) == 3)
-        return false;
-      DblSpaced = (type == 1);
-      break;
-    }
-  }
-  return DisassembleNLdSt0(MI, Opcode, insn, NumOps, NumOpsAdded,
-                           slice(insn, 21, 21) == 0, DblSpaced, alignment/8, B);
-}
-
-// VMOV (immediate)
-//   Qd/Dd imm
-// VBIC (immediate)
-// VORR (immediate)
-//   Qd/Dd imm src(=Qd/Dd)
-static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-
-  assert(NumOps >= 2 &&
-         (OpInfo[0].RegClass == ARM::DPRRegClassID ||
-          OpInfo[0].RegClass == ARM::QPRRegClassID) &&
-         (OpInfo[1].RegClass < 0) &&
-         "Expect 1 reg operand followed by 1 imm operand");
-
-  // Qd/Dd = Inst{22:15-12} => NEON Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass,
-                                                     decodeNEONRd(insn))));
-
-  ElemSize esize = ESizeNA;
-  switch (Opcode) {
-  case ARM::VMOVv8i8:
-  case ARM::VMOVv16i8:
-    esize = ESize8;
-    break;
-  case ARM::VMOVv4i16:
-  case ARM::VMOVv8i16:
-  case ARM::VMVNv4i16:
-  case ARM::VMVNv8i16:
-  case ARM::VBICiv4i16:
-  case ARM::VBICiv8i16:
-  case ARM::VORRiv4i16:
-  case ARM::VORRiv8i16:
-    esize = ESize16;
-    break;
-  case ARM::VMOVv2i32:
-  case ARM::VMOVv4i32:
-  case ARM::VMVNv2i32:
-  case ARM::VMVNv4i32:
-  case ARM::VBICiv2i32:
-  case ARM::VBICiv4i32:
-  case ARM::VORRiv2i32:
-  case ARM::VORRiv4i32:
-    esize = ESize32;
-    break;
-  case ARM::VMOVv1i64:
-  case ARM::VMOVv2i64:
-    esize = ESize64;
-    break;
-  default:
-    assert(0 && "Unexpected opcode!");
-    return false;
-  }
-
-  // One register and a modified immediate value.
-  // Add the imm operand.
-  MI.addOperand(MCOperand::CreateImm(decodeN1VImm(insn, esize)));
-
-  NumOpsAdded = 2;
-
-  // VBIC/VORRiv*i* variants have an extra $src = $Vd to be filled in.
-  if (NumOps >= 3 &&
-      (OpInfo[2].RegClass == ARM::DPRRegClassID ||
-       OpInfo[2].RegClass == ARM::QPRRegClassID)) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass,
-                                                     decodeNEONRd(insn))));
-    NumOpsAdded += 1;
-  }
-
-  return true;
-}
-
-namespace {
-enum N2VFlag {
-  N2V_None,
-  N2V_VectorDupLane,
-  N2V_VectorConvert_Between_Float_Fixed
-};
-} // End of unnamed namespace
-
-// Vector Convert [between floating-point and fixed-point]
-//   Qd/Dd Qm/Dm [fbits]
-//
-// Vector Duplicate Lane (from scalar to all elements) Instructions.
-// VDUPLN16d, VDUPLN16q, VDUPLN32d, VDUPLN32q, VDUPLN8d, VDUPLN8q:
-//   Qd/Dd Dm index
-//
-// Vector Move Long:
-//   Qd Dm
-//
-// Vector Move Narrow:
-//   Dd Qm
-//
-// Others
-static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, N2VFlag Flag, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opc];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-
-  assert(NumOps >= 2 &&
-         (OpInfo[0].RegClass == ARM::DPRRegClassID ||
-          OpInfo[0].RegClass == ARM::QPRRegClassID) &&
-         (OpInfo[1].RegClass == ARM::DPRRegClassID ||
-          OpInfo[1].RegClass == ARM::QPRRegClassID) &&
-         "Expect >= 2 operands and first 2 as reg operands");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  ElemSize esize = ESizeNA;
-  if (Flag == N2V_VectorDupLane) {
-    // VDUPLN has its index embedded.  Its size can be inferred from the Opcode.
-    assert(Opc >= ARM::VDUPLN16d && Opc <= ARM::VDUPLN8q &&
-           "Unexpected Opcode");
-    esize = (Opc == ARM::VDUPLN8d || Opc == ARM::VDUPLN8q) ? ESize8
-       : ((Opc == ARM::VDUPLN16d || Opc == ARM::VDUPLN16q) ? ESize16
-                                                           : ESize32);
-  }
-
-  // Qd/Dd = Inst{22:15-12} => NEON Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeNEONRd(insn))));
-  ++OpIdx;
-
-  // VPADAL...
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
-    // TIED_TO operand.
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  // Dm = Inst{5:3-0} => NEON Rm
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeNEONRm(insn))));
-  ++OpIdx;
-
-  // VZIP and others have two TIED_TO reg operands.
-  int Idx;
-  while (OpIdx < NumOps &&
-         (Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-    // Add TIED_TO operand.
-    MI.addOperand(MI.getOperand(Idx));
-    ++OpIdx;
-  }
-
-  // Add the imm operand, if required.
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-      && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-
-    unsigned imm = 0xFFFFFFFF;
-
-    if (Flag == N2V_VectorDupLane)
-      imm = decodeNVLaneDupIndex(insn, esize);
-    if (Flag == N2V_VectorConvert_Between_Float_Fixed)
-      imm = decodeVCVTFractionBits(insn);
-
-    assert(imm != 0xFFFFFFFF && "Internal error");
-    MI.addOperand(MCOperand::CreateImm(imm));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-static bool DisassembleN2RegFrm(MCInst &MI, unsigned Opc, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
-                                N2V_None, B);
-}
-static bool DisassembleNVCVTFrm(MCInst &MI, unsigned Opc, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
-                                N2V_VectorConvert_Between_Float_Fixed, B);
-}
-static bool DisassembleNVecDupLnFrm(MCInst &MI, unsigned Opc, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
-                                N2V_VectorDupLane, B);
-}
-
-// Vector Shift [Accumulate] Instructions.
-// Qd/Dd [Qd/Dd (TIED_TO)] Qm/Dm ShiftAmt
-//
-// Vector Shift Left Long (with maximum shift count) Instructions.
-// VSHLLi16, VSHLLi32, VSHLLi8: Qd Dm imm (== size)
-//
-static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, bool LeftShift, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-
-  assert(NumOps >= 3 &&
-         (OpInfo[0].RegClass == ARM::DPRRegClassID ||
-          OpInfo[0].RegClass == ARM::QPRRegClassID) &&
-         (OpInfo[1].RegClass == ARM::DPRRegClassID ||
-          OpInfo[1].RegClass == ARM::QPRRegClassID) &&
-         "Expect >= 3 operands and first 2 as reg operands");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // Qd/Dd = Inst{22:15-12} => NEON Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeNEONRd(insn))));
-  ++OpIdx;
-
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
-    // TIED_TO operand.
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  assert((OpInfo[OpIdx].RegClass == ARM::DPRRegClassID ||
-          OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) &&
-         "Reg operand expected");
-
-  // Qm/Dm = Inst{5:3-0} => NEON Rm
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeNEONRm(insn))));
-  ++OpIdx;
-
-  assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected");
-
-  // Add the imm operand.
-
-  // VSHLL has maximum shift count as the imm, inferred from its size.
-  unsigned Imm;
-  switch (Opcode) {
-  default:
-    Imm = decodeNVSAmt(insn, LeftShift);
-    break;
-  case ARM::VSHLLi8:
-    Imm = 8;
-    break;
-  case ARM::VSHLLi16:
-    Imm = 16;
-    break;
-  case ARM::VSHLLi32:
-    Imm = 32;
-    break;
-  }
-  MI.addOperand(MCOperand::CreateImm(Imm));
-  ++OpIdx;
-
-  return true;
-}
-
-// Left shift instructions.
-static bool DisassembleN2RegVecShLFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, true,
-                                 B);
-}
-// Right shift instructions have different shift amount interpretation.
-static bool DisassembleN2RegVecShRFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, false,
-                                 B);
-}
-
-namespace {
-enum N3VFlag {
-  N3V_None,
-  N3V_VectorExtract,
-  N3V_VectorShift,
-  N3V_Multiply_By_Scalar
-};
-} // End of unnamed namespace
-
-// NEON Three Register Instructions with Optional Immediate Operand
-//
-// Vector Extract Instructions.
-// Qd/Dd Qn/Dn Qm/Dm imm4
-//
-// Vector Shift (Register) Instructions.
-// Qd/Dd Qm/Dm Qn/Dn (notice the order of m, n)
-//
-// Vector Multiply [Accumulate/Subtract] [Long] By Scalar Instructions.
-// Qd/Dd Qn/Dn RestrictedDm index
-//
-// Others
-static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, N3VFlag Flag, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-
-  // No checking for OpInfo[2] because of MOVDneon/MOVQ with only two regs.
-  assert(NumOps >= 3 &&
-         (OpInfo[0].RegClass == ARM::DPRRegClassID ||
-          OpInfo[0].RegClass == ARM::QPRRegClassID) &&
-         (OpInfo[1].RegClass == ARM::DPRRegClassID ||
-          OpInfo[1].RegClass == ARM::QPRRegClassID) &&
-         "Expect >= 3 operands and first 2 as reg operands");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  bool VdVnVm = Flag == N3V_VectorShift ? false : true;
-  bool IsImm4 = Flag == N3V_VectorExtract ? true : false;
-  bool IsDmRestricted = Flag == N3V_Multiply_By_Scalar ? true : false;
-  ElemSize esize = ESizeNA;
-  if (Flag == N3V_Multiply_By_Scalar) {
-    unsigned size = (insn >> 20) & 3;
-    if (size == 1) esize = ESize16;
-    if (size == 2) esize = ESize32;
-    assert (esize == ESize16 || esize == ESize32);
-  }
-
-  // Qd/Dd = Inst{22:15-12} => NEON Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeNEONRd(insn))));
-  ++OpIdx;
-
-  // VABA, VABAL, VBSLd, VBSLq, ...
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
-    // TIED_TO operand.
-    MI.addOperand(MCOperand::CreateReg(0));
-    ++OpIdx;
-  }
-
-  // Dn = Inst{7:19-16} => NEON Rn
-  // or
-  // Dm = Inst{5:3-0} => NEON Rm
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                  VdVnVm ? decodeNEONRn(insn)
-                                         : decodeNEONRm(insn))));
-  ++OpIdx;
-
-  // Dm = Inst{5:3-0} => NEON Rm
-  // or
-  // Dm is restricted to D0-D7 if size is 16, D0-D15 otherwise
-  // or
-  // Dn = Inst{7:19-16} => NEON Rn
-  unsigned m = VdVnVm ? (IsDmRestricted ? decodeRestrictedDm(insn, esize)
-                                        : decodeNEONRm(insn))
-                      : decodeNEONRn(insn);
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, OpInfo[OpIdx].RegClass, m)));
-  ++OpIdx;
-
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-      && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // Add the imm operand.
-    unsigned Imm = 0;
-    if (IsImm4)
-      Imm = decodeN3VImm(insn);
-    else if (IsDmRestricted)
-      Imm = decodeRestrictedDmIndex(insn, esize);
-    else {
-      assert(0 && "Internal error: unreachable code!");
-      return false;
-    }
-
-    MI.addOperand(MCOperand::CreateImm(Imm));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-static bool DisassembleN3RegFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  N3V_None, B);
-}
-static bool DisassembleN3RegVecShFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  N3V_VectorShift, B);
-}
-static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  N3V_VectorExtract, B);
-}
-static bool DisassembleNVecMulScalarFrm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  N3V_Multiply_By_Scalar, B);
-}
-
-// Vector Table Lookup
-//
-// VTBL1, VTBX1: Dd [Dd(TIED_TO)] Dn Dm
-// VTBL2, VTBX2: Dd [Dd(TIED_TO)] Dn Dn+1 Dm
-// VTBL3, VTBX3: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dm
-// VTBL4, VTBX4: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dn+3 Dm
-static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::DPRRegClassID &&
-         OpInfo[1].RegClass == ARM::DPRRegClassID &&
-         OpInfo[2].RegClass == ARM::DPRRegClassID &&
-         "Expect >= 3 operands and first 3 as reg operands");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned Rn = decodeNEONRn(insn);
-
-  // {Dn} encoded as len = 0b00
-  // {Dn Dn+1} encoded as len = 0b01
-  // {Dn Dn+1 Dn+2 } encoded as len = 0b10
-  // {Dn Dn+1 Dn+2 Dn+3} encoded as len = 0b11
-  unsigned Len = slice(insn, 9, 8) + 1;
-
-  // Dd (the destination vector)
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
-                                                     decodeNEONRd(insn))));
-  ++OpIdx;
-
-  // Process tied_to operand constraint.
-  int Idx;
-  if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-    MI.addOperand(MI.getOperand(Idx));
-    ++OpIdx;
-  }
-
-  // Do the <list> now.
-  for (unsigned i = 0; i < Len; ++i) {
-    assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID &&
-           "Reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
-                                                       Rn + i)));
-    ++OpIdx;
-  }
-
-  // Dm (the index vector)
-  assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID &&
-         "Reg operand (index vector) expected");
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
-                                                     decodeNEONRm(insn))));
-  ++OpIdx;
-
-  return true;
-}
-
-// Vector Get Lane (move scalar to ARM core register) Instructions.
-// VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index
-static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  assert(MCID.getNumDefs() == 1 && NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::GPRRegClassID &&
-         OpInfo[1].RegClass == ARM::DPRRegClassID &&
-         OpInfo[2].RegClass < 0 &&
-         "Expect >= 3 operands with one dst operand");
-
-  ElemSize esize =
-    Opcode == ARM::VGETLNi32 ? ESize32
-      : ((Opcode == ARM::VGETLNs16 || Opcode == ARM::VGETLNu16) ? ESize16
-                                                                : ESize8);
-
-  // Rt = Inst{15-12} => ARM Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  // Dn = Inst{7:19-16} => NEON Rn
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
-                                                     decodeNEONRn(insn))));
-
-  MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize)));
-
-  NumOpsAdded = 3;
-  return true;
-}
-
-// Vector Set Lane (move ARM core register to scalar) Instructions.
-// VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index
-static bool DisassembleNSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  if (!OpInfo) return false;
-
-  assert(MCID.getNumDefs() == 1 && NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::DPRRegClassID &&
-         OpInfo[1].RegClass == ARM::DPRRegClassID &&
-         MCID.getOperandConstraint(1, MCOI::TIED_TO) != -1 &&
-         OpInfo[2].RegClass == ARM::GPRRegClassID &&
-         OpInfo[3].RegClass < 0 &&
-         "Expect >= 3 operands with one dst operand");
-
-  ElemSize esize =
-    Opcode == ARM::VSETLNi8 ? ESize8
-                            : (Opcode == ARM::VSETLNi16 ? ESize16
-                                                        : ESize32);
-
-  // Dd = Inst{7:19-16} => NEON Rn
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
-                                                     decodeNEONRn(insn))));
-
-  // TIED_TO operand.
-  MI.addOperand(MCOperand::CreateReg(0));
-
-  // Rt = Inst{15-12} => ARM Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize)));
-
-  NumOpsAdded = 4;
-  return true;
-}
-
-// Vector Duplicate Instructions (from ARM core register to all elements).
-// VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt
-static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
-  assert(NumOps >= 2 &&
-         (OpInfo[0].RegClass == ARM::DPRRegClassID ||
-          OpInfo[0].RegClass == ARM::QPRRegClassID) &&
-         OpInfo[1].RegClass == ARM::GPRRegClassID &&
-         "Expect >= 2 operands and first 2 as reg operand");
-
-  unsigned RegClass = OpInfo[0].RegClass;
-
-  // Qd/Dd = Inst{7:19-16} => NEON Rn
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClass,
-                                                     decodeNEONRn(insn))));
-
-  // Rt = Inst{15-12} => ARM Rd
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  NumOpsAdded = 2;
-  return true;
-}
-
-static inline bool PreLoadOpcode(unsigned Opcode) {
-  switch(Opcode) {
-  case ARM::PLDi12:  case ARM::PLDrs:
-  case ARM::PLDWi12: case ARM::PLDWrs:
-  case ARM::PLIi12:  case ARM::PLIrs:
-    return true;
-  default:
-    return false;
-  }
-}
-
-static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  // Preload Data/Instruction requires either 2 or 3 operands.
-  // PLDi12, PLDWi12, PLIi12: addrmode_imm12
-  // PLDrs, PLDWrs, PLIrs:    ldst_so_reg
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-
-  if (Opcode == ARM::PLDi12 || Opcode == ARM::PLDWi12
-      || Opcode == ARM::PLIi12) {
-    unsigned Imm12 = slice(insn, 11, 0);
-    bool Negative = getUBit(insn) == 0;
-
-    // A8.6.118 PLD (literal) PLDWi12 with Rn=PC is transformed to PLDi12.
-    if (Opcode == ARM::PLDWi12 && slice(insn, 19, 16) == 0xF) {
-      DEBUG(errs() << "Rn == '1111': PLDWi12 morphed to PLDi12\n");
-      MI.setOpcode(ARM::PLDi12);
-    }
-    
-    // -0 is represented specially. All other values are as normal.
-    int Offset = Negative ? -1 * Imm12 : Imm12;
-    if (Imm12 == 0 && Negative)
-      Offset = INT32_MIN;
-
-    MI.addOperand(MCOperand::CreateImm(Offset));
-    NumOpsAdded = 2;
-  } else {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-
-    ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-
-    // Inst{6-5} encodes the shift opcode.
-    ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
-    // Inst{11-7} encodes the imm5 shift amount.
-    unsigned ShImm = slice(insn, 11, 7);
-
-    // A8.4.1.  Possible rrx or shift amount of 32...
-    getImmShiftSE(ShOp, ShImm);
-    MI.addOperand(MCOperand::CreateImm(
-                    ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp)));
-    NumOpsAdded = 3;
-  }
-
-  return true;
-}
-
-static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  if (Opcode == ARM::DMB || Opcode == ARM::DSB || Opcode == ARM::ISB) {
-    // Inst{3-0} encodes the memory barrier option for the variants.
-    unsigned opt = slice(insn, 3, 0);
-    switch (opt) {
-    case ARM_MB::SY:  case ARM_MB::ST:
-    case ARM_MB::ISH: case ARM_MB::ISHST:
-    case ARM_MB::NSH: case ARM_MB::NSHST:
-    case ARM_MB::OSH: case ARM_MB::OSHST:
-      MI.addOperand(MCOperand::CreateImm(opt));
-      NumOpsAdded = 1;
-      return true;
-    default:
-      return false;
-    }
-  }
-
-  switch (Opcode) {
-  case ARM::CLREX:
-  case ARM::NOP:
-  case ARM::TRAP:
-  case ARM::YIELD:
-  case ARM::WFE:
-  case ARM::WFI:
-  case ARM::SEV:
-    return true;
-  case ARM::SWP:
-  case ARM::SWPB:
-    // SWP, SWPB: Rd Rm Rn
-    // Delegate to DisassembleLdStExFrm()....
-    return DisassembleLdStExFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-  default:
-    break;
-  }
-
-  if (Opcode == ARM::SETEND) {
-    NumOpsAdded = 1;
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 9, 9)));
-    return true;
-  }
-
-  // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
-  // opcodes which match the same real instruction. This is needed since there's
-  // no current handling of optional arguments. Fix here when a better handling
-  // of optional arguments is implemented.
-  if (Opcode == ARM::CPS3p) {   // M = 1
-    // Let's reject these impossible imod values by returning false:
-    // 1. (imod=0b01)
-    //
-    // AsmPrinter cannot handle imod=0b00, plus (imod=0b00,M=1,iflags!=0) is an
-    // invalid combination, so we just check for imod=0b00 here.
-    if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
-      return false;
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6)));   // iflags
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));   // mode
-    NumOpsAdded = 3;
-    return true;
-  }
-  if (Opcode == ARM::CPS2p) { // mode = 0, M = 0
-    // Let's reject these impossible imod values by returning false:
-    // 1. (imod=0b00,M=0)
-    // 2. (imod=0b01)
-    if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
-      return false;
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6)));   // iflags
-    NumOpsAdded = 2;
-    return true;
-  }
-  if (Opcode == ARM::CPS1p) { // imod = 0, iflags = 0, M = 1
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // DBG has its option specified in Inst{3-0}.
-  if (Opcode == ARM::DBG) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0)));
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // BKPT takes an imm32 val equal to ZeroExtend(Inst{19-8:3-0}).
-  if (Opcode == ARM::BKPT) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 8) << 4 |
-                                       slice(insn, 3, 0)));
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  if (PreLoadOpcode(Opcode))
-    return DisassemblePreLoadFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
-  assert(0 && "Unexpected misc instruction!");
-  return false;
-}
-
-/// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP.
-/// We divide the disassembly task into different categories, with each one
-/// corresponding to a specific instruction encoding format.  There could be
-/// exceptions when handling a specific format, and that is why the Opcode is
-/// also present in the function prototype.
-static const DisassembleFP FuncPtrs[] = {
-  &DisassemblePseudo,
-  &DisassembleMulFrm,
-  &DisassembleBrFrm,
-  &DisassembleBrMiscFrm,
-  &DisassembleDPFrm,
-  &DisassembleDPSoRegFrm,
-  &DisassembleLdFrm,
-  &DisassembleStFrm,
-  &DisassembleLdMiscFrm,
-  &DisassembleStMiscFrm,
-  &DisassembleLdStMulFrm,
-  &DisassembleLdStExFrm,
-  &DisassembleArithMiscFrm,
-  &DisassembleSatFrm,
-  &DisassembleExtFrm,
-  &DisassembleVFPUnaryFrm,
-  &DisassembleVFPBinaryFrm,
-  &DisassembleVFPConv1Frm,
-  &DisassembleVFPConv2Frm,
-  &DisassembleVFPConv3Frm,
-  &DisassembleVFPConv4Frm,
-  &DisassembleVFPConv5Frm,
-  &DisassembleVFPLdStFrm,
-  &DisassembleVFPLdStMulFrm,
-  &DisassembleVFPMiscFrm,
-  &DisassembleThumbFrm,
-  &DisassembleMiscFrm,
-  &DisassembleNGetLnFrm,
-  &DisassembleNSetLnFrm,
-  &DisassembleNDupFrm,
-
-  // VLD and VST (including one lane) Instructions.
-  &DisassembleNLdSt,
-
-  // A7.4.6 One register and a modified immediate value
-  // 1-Register Instructions with imm.
-  // LLVM only defines VMOVv instructions.
-  &DisassembleN1RegModImmFrm,
-
-  // 2-Register Instructions with no imm.
-  &DisassembleN2RegFrm,
-
-  // 2-Register Instructions with imm (vector convert float/fixed point).
-  &DisassembleNVCVTFrm,
-
-  // 2-Register Instructions with imm (vector dup lane).
-  &DisassembleNVecDupLnFrm,
-
-  // Vector Shift Left Instructions.
-  &DisassembleN2RegVecShLFrm,
-
-  // Vector Shift Righ Instructions, which has different interpretation of the
-  // shift amount from the imm6 field.
-  &DisassembleN2RegVecShRFrm,
-
-  // 3-Register Data-Processing Instructions.
-  &DisassembleN3RegFrm,
-
-  // Vector Shift (Register) Instructions.
-  // D:Vd M:Vm N:Vn (notice that M:Vm is the first operand)
-  &DisassembleN3RegVecShFrm,
-
-  // Vector Extract Instructions.
-  &DisassembleNVecExtractFrm,
-
-  // Vector [Saturating Rounding Doubling] Multiply [Accumulate/Subtract] [Long]
-  // By Scalar Instructions.
-  &DisassembleNVecMulScalarFrm,
-
-  // Vector Table Lookup uses byte indexes in a control vector to look up byte
-  // values in a table and generate a new vector.
-  &DisassembleNVTBLFrm,
-
-  NULL
-};
-
-/// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder.
-/// The general idea is to set the Opcode for the MCInst, followed by adding
-/// the appropriate MCOperands to the MCInst.  ARM Basic MC Builder delegates
-/// to the Format-specific disassemble function for disassembly, followed by
-/// TryPredicateAndSBitModifier() to do PredicateOperand and OptionalDefOperand
-/// which follow the Dst/Src Operands.
-bool ARMBasicMCBuilder::BuildIt(MCInst &MI, uint32_t insn) {
-  // Stage 1 sets the Opcode.
-  MI.setOpcode(Opcode);
-  // If the number of operands is zero, we're done!
-  if (NumOps == 0)
-    return true;
-
-  // Stage 2 calls the format-specific disassemble function to build the operand
-  // list.
-  if (Disasm == NULL)
-    return false;
-  unsigned NumOpsAdded = 0;
-  bool OK = (*Disasm)(MI, Opcode, insn, NumOps, NumOpsAdded, this);
-
-  if (!OK || this->Err != 0) return false;
-  if (NumOpsAdded >= NumOps)
-    return true;
-
-  // Stage 3 deals with operands unaccounted for after stage 2 is finished.
-  // FIXME: Should this be done selectively?
-  return TryPredicateAndSBitModifier(MI, Opcode, insn, NumOps - NumOpsAdded);
-}
-
-// A8.3 Conditional execution
-// A8.3.1 Pseudocode details of conditional execution
-// Condition bits '111x' indicate the instruction is always executed.
-static uint32_t CondCode(uint32_t CondField) {
-  if (CondField == 0xF)
-    return ARMCC::AL;
-  return CondField;
-}
-
-/// DoPredicateOperands - DoPredicateOperands process the predicate operands
-/// of some Thumb instructions which come before the reglist operands.  It
-/// returns true if the two predicate operands have been processed.
-bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode,
-    uint32_t /* insn */, unsigned short NumOpsRemaining) {
-
-  assert(NumOpsRemaining > 0 && "Invalid argument");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned Idx = MI.getNumOperands();
-
-  // First, we check whether this instr specifies the PredicateOperand through
-  // a pair of MCOperandInfos with isPredicate() property.
-  if (NumOpsRemaining >= 2 &&
-      OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() &&
-      OpInfo[Idx].RegClass < 0 &&
-      OpInfo[Idx+1].RegClass == ARM::CCRRegClassID)
-  {
-    // If we are inside an IT block, get the IT condition bits maintained via
-    // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond().
-    // See also A2.5.2.
-    if (InITBlock())
-      MI.addOperand(MCOperand::CreateImm(GetITCond()));
-    else
-      MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
-    MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-    return true;
-  }
-
-  return false;
-}
-
-/// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
-/// the possible Predicate and SBitModifier, to build the remaining MCOperand
-/// constituents.
-bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOpsRemaining) {
-
-  assert(NumOpsRemaining > 0 && "Invalid argument");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  const std::string &Name = ARMInsts[Opcode].Name;
-  unsigned Idx = MI.getNumOperands();
-  uint64_t TSFlags = ARMInsts[Opcode].TSFlags;
-
-  // First, we check whether this instr specifies the PredicateOperand through
-  // a pair of MCOperandInfos with isPredicate() property.
-  if (NumOpsRemaining >= 2 &&
-      OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() &&
-      OpInfo[Idx].RegClass < 0 &&
-      OpInfo[Idx+1].RegClass == ARM::CCRRegClassID)
-  {
-    // If we are inside an IT block, get the IT condition bits maintained via
-    // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond().
-    // See also A2.5.2.
-    if (InITBlock())
-      MI.addOperand(MCOperand::CreateImm(GetITCond()));
-    else {
-      if (Name.length() > 1 && Name[0] == 't') {
-        // Thumb conditional branch instructions have their cond field embedded,
-        // like ARM.
-        //
-        // A8.6.16 B
-        // Check for undefined encodings.
-        unsigned cond;
-        if (Name == "t2Bcc") {
-          if ((cond = slice(insn, 25, 22)) >= 14)
-            return false;
-          MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
-        } else if (Name == "tBcc") {
-          if ((cond = slice(insn, 11, 8)) == 14)
-            return false;
-          MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
-        } else
-          MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
-      } else {
-        // ARM instructions get their condition field from Inst{31-28}.
-        // We should reject Inst{31-28} = 0b1111 as invalid encoding.
-        if (!isNEONDomain(TSFlags) && getCondField(insn) == 0xF)
-          return false;
-        MI.addOperand(MCOperand::CreateImm(CondCode(getCondField(insn))));
-      }
-    }
-    MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-    Idx += 2;
-    NumOpsRemaining -= 2;
-  }
-
-  if (NumOpsRemaining == 0)
-    return true;
-
-  // Next, if OptionalDefOperand exists, we check whether the 'S' bit is set.
-  if (OpInfo[Idx].isOptionalDef() && OpInfo[Idx].RegClass==ARM::CCRRegClassID) {
-    MI.addOperand(MCOperand::CreateReg(getSBit(insn) == 1 ? ARM::CPSR : 0));
-    --NumOpsRemaining;
-  }
-
-  if (NumOpsRemaining == 0)
-    return true;
-  else
-    return false;
-}
-
-/// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary
-/// after BuildIt is finished.
-bool ARMBasicMCBuilder::RunBuildAfterHook(bool Status, MCInst &MI,
-    uint32_t insn) {
-
-  if (!SP) return Status;
-
-  if (Opcode == ARM::t2IT)
-    Status = SP->InitIT(slice(insn, 7, 0)) ? Status : false;
-  else if (InITBlock())
-    SP->UpdateIT();
-
-  return Status;
-}
-
-/// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder.
-ARMBasicMCBuilder::ARMBasicMCBuilder(unsigned opc, ARMFormat format,
-                                     unsigned short num)
-  : Opcode(opc), Format(format), NumOps(num), SP(0), Err(0) {
-  unsigned Idx = (unsigned)format;
-  assert(Idx < (array_lengthof(FuncPtrs) - 1) && "Unknown format");
-  Disasm = FuncPtrs[Idx];
-}
-
-/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC
-/// infrastructure of an MCInst given the Opcode and Format of the instr.
-/// Return NULL if it fails to create/return a proper builder.  API clients
-/// are responsible for freeing up of the allocated memory.  Cacheing can be
-/// performed by the API clients to improve performance.
-ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) {
-  // For "Unknown format", fail by returning a NULL pointer.
-  if ((unsigned)Format >= (array_lengthof(FuncPtrs) - 1)) {
-    DEBUG(errs() << "Unknown format\n");
-    return 0;
-  }
-
-  return new ARMBasicMCBuilder(Opcode, Format,
-                               ARMInsts[Opcode].getNumOperands());
-}
-
-/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
-/// operand in place of the immediate Value in the MCInst.  The immediate
-/// Value has had any PC adjustment made by the caller.  If the getOpInfo()
-/// function was set as part of the setupBuilderForSymbolicDisassembly() call
-/// then that function is called to get any symbolic information at the
-/// builder's Address for this instrution.  If that returns non-zero then the
-/// symbolic information it returns is used to create an MCExpr and that is
-/// added as an operand to the MCInst.  This function returns true if it adds
-/// an operand to the MCInst and false otherwise.
-bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value,
-                                                 uint64_t InstSize,
-                                                 MCInst &MI) {
-  if (!GetOpInfo)
-    return false;
-
-  struct LLVMOpInfo1 SymbolicOp;
-  SymbolicOp.Value = Value;
-  if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp))
-    return false;
-
-  const MCExpr *Add = NULL;
-  if (SymbolicOp.AddSymbol.Present) {
-    if (SymbolicOp.AddSymbol.Name) {
-      StringRef Name(SymbolicOp.AddSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Sub = NULL;
-  if (SymbolicOp.SubtractSymbol.Present) {
-    if (SymbolicOp.SubtractSymbol.Name) {
-      StringRef Name(SymbolicOp.SubtractSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Off = NULL;
-  if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
-  const MCExpr *Expr;
-  if (Sub) {
-    const MCExpr *LHS;
-    if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
-    else
-      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
-    else
-      Expr = LHS;
-  } else if (Add) {
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
-    else
-      Expr = Add;
-  } else {
-    if (Off != 0)
-      Expr = Off;
-    else
-      Expr = MCConstantExpr::Create(0, *Ctx);
-  }
-
-  if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
-    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
-  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
-    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
-  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
-    MI.addOperand(MCOperand::CreateExpr(Expr));
-  else 
-    assert("bad SymbolicOp.VariantKind");
-
-  return true;
-}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
deleted file mode 100644
index a7ba14141c0ab..0000000000000
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
+++ /dev/null
@@ -1,336 +0,0 @@
-//===- ARMDisassemblerCore.h - ARM disassembler helpers ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-//
-// The first part defines the enumeration type of ARM instruction format, which
-// specifies the encoding used by the instruction, as well as a helper function
-// to convert the enums to printable char strings.
-//
-// It also contains code to represent the concepts of Builder and DisassembleFP
-// to solve the problem of disassembling an ARM instr.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMDISASSEMBLERCORE_H
-#define ARMDISASSEMBLERCORE_H
-
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm-c/Disassembler.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMRegisterInfo.h"
-#include "ARMDisassembler.h"
-
-namespace llvm {
-class MCContext;
-
-class ARMUtils {
-public:
-  static const char *OpcodeName(unsigned Opcode);
-};
-
-/////////////////////////////////////////////////////
-//                                                 //
-//  Enums and Utilities for ARM Instruction Format //
-//                                                 //
-/////////////////////////////////////////////////////
-
-#define ARM_FORMATS                   \
-  ENTRY(ARM_FORMAT_PSEUDO,         0) \
-  ENTRY(ARM_FORMAT_MULFRM,         1) \
-  ENTRY(ARM_FORMAT_BRFRM,          2) \
-  ENTRY(ARM_FORMAT_BRMISCFRM,      3) \
-  ENTRY(ARM_FORMAT_DPFRM,          4) \
-  ENTRY(ARM_FORMAT_DPSOREGFRM,     5) \
-  ENTRY(ARM_FORMAT_LDFRM,          6) \
-  ENTRY(ARM_FORMAT_STFRM,          7) \
-  ENTRY(ARM_FORMAT_LDMISCFRM,      8) \
-  ENTRY(ARM_FORMAT_STMISCFRM,      9) \
-  ENTRY(ARM_FORMAT_LDSTMULFRM,    10) \
-  ENTRY(ARM_FORMAT_LDSTEXFRM,     11) \
-  ENTRY(ARM_FORMAT_ARITHMISCFRM,  12) \
-  ENTRY(ARM_FORMAT_SATFRM,        13) \
-  ENTRY(ARM_FORMAT_EXTFRM,        14) \
-  ENTRY(ARM_FORMAT_VFPUNARYFRM,   15) \
-  ENTRY(ARM_FORMAT_VFPBINARYFRM,  16) \
-  ENTRY(ARM_FORMAT_VFPCONV1FRM,   17) \
-  ENTRY(ARM_FORMAT_VFPCONV2FRM,   18) \
-  ENTRY(ARM_FORMAT_VFPCONV3FRM,   19) \
-  ENTRY(ARM_FORMAT_VFPCONV4FRM,   20) \
-  ENTRY(ARM_FORMAT_VFPCONV5FRM,   21) \
-  ENTRY(ARM_FORMAT_VFPLDSTFRM,    22) \
-  ENTRY(ARM_FORMAT_VFPLDSTMULFRM, 23) \
-  ENTRY(ARM_FORMAT_VFPMISCFRM,    24) \
-  ENTRY(ARM_FORMAT_THUMBFRM,      25) \
-  ENTRY(ARM_FORMAT_MISCFRM,       26) \
-  ENTRY(ARM_FORMAT_NEONGETLNFRM,  27) \
-  ENTRY(ARM_FORMAT_NEONSETLNFRM,  28) \
-  ENTRY(ARM_FORMAT_NEONDUPFRM,    29) \
-  ENTRY(ARM_FORMAT_NLdSt,         30) \
-  ENTRY(ARM_FORMAT_N1RegModImm,   31) \
-  ENTRY(ARM_FORMAT_N2Reg,         32) \
-  ENTRY(ARM_FORMAT_NVCVT,         33) \
-  ENTRY(ARM_FORMAT_NVecDupLn,     34) \
-  ENTRY(ARM_FORMAT_N2RegVecShL,   35) \
-  ENTRY(ARM_FORMAT_N2RegVecShR,   36) \
-  ENTRY(ARM_FORMAT_N3Reg,         37) \
-  ENTRY(ARM_FORMAT_N3RegVecSh,    38) \
-  ENTRY(ARM_FORMAT_NVecExtract,   39) \
-  ENTRY(ARM_FORMAT_NVecMulScalar, 40) \
-  ENTRY(ARM_FORMAT_NVTBL,         41)
-
-// ARM instruction format specifies the encoding used by the instruction.
-#define ENTRY(n, v) n = v,
-typedef enum {
-  ARM_FORMATS
-  ARM_FORMAT_NA
-} ARMFormat;
-#undef ENTRY
-
-// Converts enum to const char*.
-static const inline char *stringForARMFormat(ARMFormat form) {
-#define ENTRY(n, v) case n: return #n;
-  switch(form) {
-    ARM_FORMATS
-  case ARM_FORMAT_NA:
-  default:
-    return "";
-  }
-#undef ENTRY
-}
-
-/// Expands on the enum definitions from ARMBaseInstrInfo.h.
-/// They are being used by the disassembler implementation.
-namespace ARMII {
-  enum {
-    NEONRegMask = 15,
-    GPRRegMask = 15,
-    NEON_RegRdShift = 12,
-    NEON_D_BitShift = 22,
-    NEON_RegRnShift = 16,
-    NEON_N_BitShift = 7,
-    NEON_RegRmShift = 0,
-    NEON_M_BitShift = 5
-  };
-}
-
-/// Utility function for extracting [From, To] bits from a uint32_t.
-static inline unsigned slice(uint32_t Bits, unsigned From, unsigned To) {
-  assert(From < 32 && To < 32 && From >= To);
-  return (Bits >> To) & ((1 << (From - To + 1)) - 1);
-}
-
-/// Utility function for setting [From, To] bits to Val for a uint32_t.
-static inline void setSlice(unsigned &Bits, unsigned From, unsigned To,
-                            unsigned Val) {
-  assert(From < 32 && To < 32 && From >= To);
-  uint32_t Mask = ((1 << (From - To + 1)) - 1);
-  Bits &= ~(Mask << To);
-  Bits |= (Val & Mask) << To;
-}
-
-// Return an integer result equal to the number of bits of x that are ones.
-static inline uint32_t
-BitCount (uint64_t x)
-{
-    // c accumulates the total bits set in x
-    uint32_t c;
-    for (c = 0; x; ++c)
-    {
-        x &= x - 1; // clear the least significant bit set
-    }
-    return c;
-}
-
-static inline bool
-BitIsSet (const uint64_t value, const uint64_t bit)
-{
-    return (value & (1ull << bit)) != 0;
-}
-
-static inline bool
-BitIsClear (const uint64_t value, const uint64_t bit)
-{
-    return (value & (1ull << bit)) == 0;
-}
-
-/// Various utilities for checking the target specific flags.
-
-/// A unary data processing instruction doesn't have an Rn operand.
-static inline bool isUnaryDP(uint64_t TSFlags) {
-  return (TSFlags & ARMII::UnaryDP);
-}
-
-/// A NEON Domain instruction has cond field (Inst{31-28}) as 0b1111.
-static inline bool isNEONDomain(uint64_t TSFlags) {
-  return (TSFlags & ARMII::DomainNEON) ||
-         (TSFlags & ARMII::DomainNEONA8);
-}
-
-/// This four-bit field describes the addressing mode used.
-/// See also ARMBaseInstrInfo.h.
-static inline unsigned getAddrMode(uint64_t TSFlags) {
-  return (TSFlags & ARMII::AddrModeMask);
-}
-
-/// {IndexModePre, IndexModePost}
-/// Only valid for load and store ops.
-/// See also ARMBaseInstrInfo.h.
-static inline unsigned getIndexMode(uint64_t TSFlags) {
-  return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
-}
-
-/// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList.
-static inline bool isPrePostLdSt(uint64_t TSFlags) {
-  return (TSFlags & ARMII::IndexModeMask) != 0;
-}
-
-// Forward declaration.
-class ARMBasicMCBuilder;
-
-// Builder Object is mostly ignored except in some Thumb disassemble functions.
-typedef ARMBasicMCBuilder *BO;
-
-/// DisassembleFP - DisassembleFP points to a function that disassembles an insn
-/// and builds the MCOperand list upon disassembly.  It returns false on failure
-/// or true on success.  The number of operands added is updated upon success.
-typedef bool (*DisassembleFP)(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO Builder);
-
-/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC
-/// infrastructure of an MCInst given the Opcode and Format of the instr.
-/// Return NULL if it fails to create/return a proper builder.  API clients
-/// are responsible for freeing up of the allocated memory.  Cacheing can be
-/// performed by the API clients to improve performance.
-extern ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format);
-
-/// ARMBasicMCBuilder - ARMBasicMCBuilder represents an ARM MCInst builder that
-/// knows how to build up the MCOperand list.
-class ARMBasicMCBuilder {
-  friend ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format);
-  unsigned Opcode;
-  ARMFormat Format;
-  unsigned short NumOps;
-  DisassembleFP Disasm;
-  Session *SP;
-  int Err; // !=0 if the builder encounters some error condition during build.
-
-private:
-  /// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder.
-  ARMBasicMCBuilder(unsigned opc, ARMFormat format, unsigned short num);
-
-public:
-  ARMBasicMCBuilder(ARMBasicMCBuilder &B)
-    : Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm),
-      SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) {
-    Err = 0;
-  }
-
-  virtual ~ARMBasicMCBuilder() {}
-
-  void SetSession(Session *sp) {
-    SP = sp;
-  }
-
-  void SetErr(int ErrCode) {
-    Err = ErrCode;
-  }
-
-  /// DoPredicateOperands - DoPredicateOperands process the predicate operands
-  /// of some Thumb instructions which come before the reglist operands.  It
-  /// returns true if the two predicate operands have been processed.
-  bool DoPredicateOperands(MCInst& MI, unsigned Opcode,
-      uint32_t insn, unsigned short NumOpsRemaning);
-  
-  /// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
-  /// the possible Predicate and SBitModifier, to build the remaining MCOperand
-  /// constituents.
-  bool TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
-      uint32_t insn, unsigned short NumOpsRemaning);
-
-  /// InITBlock - InITBlock returns true if we are inside an IT block.
-  bool InITBlock() {
-    if (SP)
-      return SP->ITCounter > 0;
-
-    return false;
-  }
-
-  /// Build - Build delegates to BuildIt to perform the heavy liftling.  After
-  /// that, it invokes RunBuildAfterHook where some housekeepings can be done.
-  virtual bool Build(MCInst &MI, uint32_t insn) {
-    bool Status = BuildIt(MI, insn);
-    return RunBuildAfterHook(Status, MI, insn);
-  }
-
-  /// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder.
-  /// The general idea is to set the Opcode for the MCInst, followed by adding
-  /// the appropriate MCOperands to the MCInst.  ARM Basic MC Builder delegates
-  /// to the Format-specific disassemble function for disassembly, followed by
-  /// TryPredicateAndSBitModifier() for PredicateOperand and OptionalDefOperand
-  /// which follow the Dst/Src Operands.
-  virtual bool BuildIt(MCInst &MI, uint32_t insn);
-
-  /// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary
-  /// after BuildIt is finished.
-  virtual bool RunBuildAfterHook(bool Status, MCInst &MI, uint32_t insn);
-
-private:
-  /// Get condition of the current IT instruction.
-  unsigned GetITCond() {
-    assert(SP);
-    return slice(SP->ITState, 7, 4);
-  }
-
-private:
-  //
-  // Hooks for symbolic disassembly via the public 'C' interface.
-  //
-  // The function to get the symbolic information for operands.
-  LLVMOpInfoCallback GetOpInfo;
-  // The pointer to the block of symbolic information for above call back.
-  void *DisInfo;
-  // The assembly context for creating symbols and MCExprs in place of
-  // immediate operands when there is symbolic information.
-  MCContext *Ctx;
-  // The address of the instruction being disassembled.
-  uint64_t Address;
-
-public:
-  void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
-                                          void *disInfo, MCContext *ctx,
-                                          uint64_t address) {
-    GetOpInfo = getOpInfo;
-    DisInfo = disInfo;
-    Ctx = ctx;
-    Address = address;
-  }
-
-  uint64_t getBuilderAddress() const { return Address; }
-
-  /// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
-  /// operand in place of the immediate Value in the MCInst.  The immediate
-  /// Value has had any PC adjustment made by the caller.  If the getOpInfo()
-  /// function was set as part of the setupBuilderForSymbolicDisassembly() call
-  /// then that function is called to get any symbolic information at the
-  /// builder's Address for this instrution.  If that returns non-zero then the
-  /// symbolic information it returns is used to create an MCExpr and that is
-  /// added as an operand to the MCInst.  This function returns true if it adds
-  /// an operand to the MCInst and false otherwise.
-  bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI);
-
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/ARM/Disassembler/CMakeLists.txt b/lib/Target/ARM/Disassembler/CMakeLists.txt
index b23dd6ba57efa..da87751150e85 100644
--- a/lib/Target/ARM/Disassembler/CMakeLists.txt
+++ b/lib/Target/ARM/Disassembler/CMakeLists.txt
@@ -2,7 +2,6 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 
 add_llvm_library(LLVMARMDisassembler
   ARMDisassembler.cpp
-  ARMDisassemblerCore.cpp
   )
 # workaround for hanging compilation on MSVC8, 9 and 10
 if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
@@ -11,4 +10,12 @@ set_property(
   PROPERTY COMPILE_FLAGS "/Od"
   )
 endif()
-add_dependencies(LLVMARMDisassembler ARMCodeGenTable_gen)
+add_dependencies(LLVMARMDisassembler ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMDisassembler
+  LLVMARMCodeGen
+  LLVMARMDesc
+  LLVMARMInfo
+  LLVMMC
+  LLVMSupport
+  )
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
deleted file mode 100644
index 834c6f65295db..0000000000000
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ /dev/null
@@ -1,2459 +0,0 @@
-//===- ThumbDisassemblerCore.h - Thumb disassembler helpers -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code for disassembling a Thumb instr.  It is to be included by
-// ARMDisassemblerCore.cpp because it contains the static DisassembleThumbFrm()
-// function which acts as the dispatcher to disassemble a Thumb instruction.
-//
-//===----------------------------------------------------------------------===//
-
-///////////////////////////////
-//                           //
-//     Utility Functions     //
-//                           //
-///////////////////////////////
-
-// Utilities for 16-bit Thumb instructions.
-/*
-15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
-               [  tRt ]
-                      [ tRm ]  [ tRn ]  [ tRd ]
-                         D  [   Rm   ]  [  Rd ]
-
-                      [ imm3]
-               [    imm5    ]
-                   i     [    imm5   ]
-                            [       imm7      ]
-                         [       imm8         ]
-               [             imm11            ]
-
-            [   cond  ]
-*/
-
-// Extract tRt: Inst{10-8}.
-static inline unsigned getT1tRt(uint32_t insn) {
-  return slice(insn, 10, 8);
-}
-
-// Extract tRm: Inst{8-6}.
-static inline unsigned getT1tRm(uint32_t insn) {
-  return slice(insn, 8, 6);
-}
-
-// Extract tRn: Inst{5-3}.
-static inline unsigned getT1tRn(uint32_t insn) {
-  return slice(insn, 5, 3);
-}
-
-// Extract tRd: Inst{2-0}.
-static inline unsigned getT1tRd(uint32_t insn) {
-  return slice(insn, 2, 0);
-}
-
-// Extract [D:Rd]: Inst{7:2-0}.
-static inline unsigned getT1Rd(uint32_t insn) {
-  return slice(insn, 7, 7) << 3 | slice(insn, 2, 0);
-}
-
-// Extract Rm: Inst{6-3}.
-static inline unsigned getT1Rm(uint32_t insn) {
-  return slice(insn, 6, 3);
-}
-
-// Extract imm3: Inst{8-6}.
-static inline unsigned getT1Imm3(uint32_t insn) {
-  return slice(insn, 8, 6);
-}
-
-// Extract imm5: Inst{10-6}.
-static inline unsigned getT1Imm5(uint32_t insn) {
-  return slice(insn, 10, 6);
-}
-
-// Extract i:imm5: Inst{9:7-3}.
-static inline unsigned getT1Imm6(uint32_t insn) {
-  return slice(insn, 9, 9) << 5 | slice(insn, 7, 3);
-}
-
-// Extract imm7: Inst{6-0}.
-static inline unsigned getT1Imm7(uint32_t insn) {
-  return slice(insn, 6, 0);
-}
-
-// Extract imm8: Inst{7-0}.
-static inline unsigned getT1Imm8(uint32_t insn) {
-  return slice(insn, 7, 0);
-}
-
-// Extract imm11: Inst{10-0}.
-static inline unsigned getT1Imm11(uint32_t insn) {
-  return slice(insn, 10, 0);
-}
-
-// Extract cond: Inst{11-8}.
-static inline unsigned getT1Cond(uint32_t insn) {
-  return slice(insn, 11, 8);
-}
-
-static inline bool IsGPR(unsigned RegClass) {
-  return RegClass == ARM::GPRRegClassID || RegClass == ARM::rGPRRegClassID;
-}
-
-// Utilities for 32-bit Thumb instructions.
-
-static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; }
-
-// Extract imm4: Inst{19-16}.
-static inline unsigned getImm4(uint32_t insn) {
-  return slice(insn, 19, 16);
-}
-
-// Extract imm3: Inst{14-12}.
-static inline unsigned getImm3(uint32_t insn) {
-  return slice(insn, 14, 12);
-}
-
-// Extract imm8: Inst{7-0}.
-static inline unsigned getImm8(uint32_t insn) {
-  return slice(insn, 7, 0);
-}
-
-// A8.6.61 LDRB (immediate, Thumb) and friends
-// +/-: Inst{9}
-// imm8: Inst{7-0}
-static inline int decodeImm8(uint32_t insn) {
-  int Offset = getImm8(insn);
-  return slice(insn, 9, 9) ? Offset : -Offset;
-}
-
-// Extract imm12: Inst{11-0}.
-static inline unsigned getImm12(uint32_t insn) {
-  return slice(insn, 11, 0);
-}
-
-// A8.6.63 LDRB (literal) and friends
-// +/-: Inst{23}
-// imm12: Inst{11-0}
-static inline int decodeImm12(uint32_t insn) {
-  int Offset = getImm12(insn);
-  return slice(insn, 23, 23) ? Offset : -Offset;
-}
-
-// Extract imm2: Inst{7-6}.
-static inline unsigned getImm2(uint32_t insn) {
-  return slice(insn, 7, 6);
-}
-
-// For BFI, BFC, t2SBFX, and t2UBFX.
-// Extract lsb: Inst{14-12:7-6}.
-static inline unsigned getLsb(uint32_t insn) {
-  return getImm3(insn) << 2 | getImm2(insn);
-}
-
-// For BFI and BFC.
-// Extract msb: Inst{4-0}.
-static inline unsigned getMsb(uint32_t insn) {
-  return slice(insn, 4, 0);
-}
-
-// For t2SBFX and t2UBFX.
-// Extract widthminus1: Inst{4-0}.
-static inline unsigned getWidthMinus1(uint32_t insn) {
-  return slice(insn, 4, 0);
-}
-
-// For t2ADDri12 and t2SUBri12.
-// imm12 = i:imm3:imm8;
-static inline unsigned getIImm3Imm8(uint32_t insn) {
-  return slice(insn, 26, 26) << 11 | getImm3(insn) << 8 | getImm8(insn);
-}
-
-// For t2MOVi16 and t2MOVTi16.
-// imm16 = imm4:i:imm3:imm8;
-static inline unsigned getImm16(uint32_t insn) {
-  return getImm4(insn) << 12 | slice(insn, 26, 26) << 11 |
-    getImm3(insn) << 8 | getImm8(insn);
-}
-
-// Inst{5-4} encodes the shift type.
-static inline unsigned getShiftTypeBits(uint32_t insn) {
-  return slice(insn, 5, 4);
-}
-
-// Inst{14-12}:Inst{7-6} encodes the imm5 shift amount.
-static inline unsigned getShiftAmtBits(uint32_t insn) {
-  return getImm3(insn) << 2 | getImm2(insn);
-}
-
-// A8.6.17 BFC
-// Encoding T1 ARMv6T2, ARMv7
-// LLVM-specific encoding for #<lsb> and #<width>
-static inline bool getBitfieldInvMask(uint32_t insn, uint32_t &mask) {
-  uint32_t lsb = getImm3(insn) << 2 | getImm2(insn);
-  uint32_t msb = getMsb(insn);
-  uint32_t Val = 0;
-  if (msb < lsb) {
-    DEBUG(errs() << "Encoding error: msb < lsb\n");
-    return false;
-  }
-  for (uint32_t i = lsb; i <= msb; ++i)
-    Val |= (1 << i);
-  mask = ~Val;
-  return true;
-}
-
-// A8.4 Shifts applied to a register
-// A8.4.1 Constant shifts
-// A8.4.3 Pseudocode details of instruction-specified shifts and rotates
-//
-// decodeImmShift() returns the shift amount and the the shift opcode.
-// Note that, as of Jan-06-2010, LLVM does not support rrx shifted operands yet.
-static inline unsigned decodeImmShift(unsigned bits2, unsigned imm5,
-                                      ARM_AM::ShiftOpc &ShOp) {
-
-  assert(imm5 < 32 && "Invalid imm5 argument");
-  switch (bits2) {
-  default: assert(0 && "No such value");
-  case 0:
-    ShOp = (imm5 == 0 ? ARM_AM::no_shift : ARM_AM::lsl);
-    return imm5;
-  case 1:
-    ShOp = ARM_AM::lsr;
-    return (imm5 == 0 ? 32 : imm5);
-  case 2:
-    ShOp = ARM_AM::asr;
-    return (imm5 == 0 ? 32 : imm5);
-  case 3:
-    ShOp = (imm5 == 0 ? ARM_AM::rrx : ARM_AM::ror);
-    return (imm5 == 0 ? 1 : imm5);
-  }
-}
-
-// A6.3.2 Modified immediate constants in Thumb instructions
-//
-// ThumbExpandImm() returns the modified immediate constant given an imm12 for
-// Thumb data-processing instructions with modified immediate.
-// See also A6.3.1 Data-processing (modified immediate).
-static inline unsigned ThumbExpandImm(unsigned imm12) {
-  assert(imm12 <= 0xFFF && "Invalid imm12 argument");
-
-  // If the leading two bits is 0b00, the modified immediate constant is
-  // obtained by splatting the low 8 bits into the first byte, every other byte,
-  // or every byte of a 32-bit value.
-  //
-  // Otherwise, a rotate right of '1':imm12<6:0> by the amount imm12<11:7> is
-  // performed.
-
-  if (slice(imm12, 11, 10) == 0) {
-    unsigned short control = slice(imm12, 9, 8);
-    unsigned imm8 = slice(imm12, 7, 0);
-    switch (control) {
-    default:
-      assert(0 && "No such value");
-      return 0;
-    case 0:
-      return imm8;
-    case 1:
-      return imm8 << 16 | imm8;
-    case 2:
-      return imm8 << 24 | imm8 << 8;
-    case 3:
-      return imm8 << 24 | imm8 << 16 | imm8 << 8 | imm8;
-    }
-  } else {
-    // A rotate is required.
-    unsigned Val = 1 << 7 | slice(imm12, 6, 0);
-    unsigned Amt = slice(imm12, 11, 7);
-    return ARM_AM::rotr32(Val, Amt);
-  }
-}
-
-static inline int decodeImm32_B_EncodingT3(uint32_t insn) {
-  bool S = slice(insn, 26, 26);
-  bool J1 = slice(insn, 13, 13);
-  bool J2 = slice(insn, 11, 11);
-  unsigned Imm21 = slice(insn, 21, 16) << 12 | slice(insn, 10, 0) << 1;
-  if (S) Imm21 |= 1 << 20;
-  if (J2) Imm21 |= 1 << 19;
-  if (J1) Imm21 |= 1 << 18;
-
-  return SignExtend32<21>(Imm21);
-}
-
-static inline int decodeImm32_B_EncodingT4(uint32_t insn) {
-  unsigned S = slice(insn, 26, 26);
-  bool I1 = slice(insn, 13, 13) == S;
-  bool I2 = slice(insn, 11, 11) == S;
-  unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1;
-  if (S) Imm25 |= 1 << 24;
-  if (I1) Imm25 |= 1 << 23;
-  if (I2) Imm25 |= 1 << 22;
-
-  return SignExtend32<25>(Imm25);
-}
-
-static inline int decodeImm32_BL(uint32_t insn) {
-  unsigned S = slice(insn, 26, 26);
-  bool I1 = slice(insn, 13, 13) == S;
-  bool I2 = slice(insn, 11, 11) == S;
-  unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1;
-  if (S) Imm25 |= 1 << 24;
-  if (I1) Imm25 |= 1 << 23;
-  if (I2) Imm25 |= 1 << 22;
-
-  return SignExtend32<25>(Imm25);
-}
-
-static inline int decodeImm32_BLX(uint32_t insn) {
-  unsigned S = slice(insn, 26, 26);
-  bool I1 = slice(insn, 13, 13) == S;
-  bool I2 = slice(insn, 11, 11) == S;
-  unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 1) << 2;
-  if (S) Imm25 |= 1 << 24;
-  if (I1) Imm25 |= 1 << 23;
-  if (I2) Imm25 |= 1 << 22;
-
-  return SignExtend32<25>(Imm25);
-}
-
-// See, for example, A8.6.221 SXTAB16.
-static inline unsigned decodeRotate(uint32_t insn) {
-  unsigned rotate = slice(insn, 5, 4);
-  return rotate << 3;
-}
-
-///////////////////////////////////////////////
-//                                           //
-// Thumb1 instruction disassembly functions. //
-//                                           //
-///////////////////////////////////////////////
-
-// See "Utilities for 16-bit Thumb instructions" for register naming convention.
-
-// A6.2.1 Shift (immediate), add, subtract, move, and compare
-//
-// shift immediate:         tRd CPSR tRn imm5
-// add/sub register:        tRd CPSR tRn tRm
-// add/sub 3-bit immediate: tRd CPSR tRn imm3
-// add/sub 8-bit immediate: tRt CPSR tRt(TIED_TO) imm8
-// mov/cmp immediate:       tRt [CPSR] imm8 (CPSR present for mov)
-//
-// Special case:
-// tMOVSr:                  tRd tRn
-static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID
-         && "Invalid arguments");
-
-  bool Imm3 = (Opcode == ARM::tADDi3 || Opcode == ARM::tSUBi3);
-
-  // Use Rt implies use imm8.
-  bool UseRt = (Opcode == ARM::tADDi8 || Opcode == ARM::tSUBi8 ||
-                Opcode == ARM::tMOVi8 || Opcode == ARM::tCMPi8);
-
-  // Add the destination operand.
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, ARM::tGPRRegClassID,
-                                  UseRt ? getT1tRt(insn) : getT1tRd(insn))));
-  ++OpIdx;
-
-  // Check whether the next operand to be added is a CCR Register.
-  if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) {
-    assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected");
-    MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR));
-    ++OpIdx;
-  }
-
-  // Check whether the next operand to be added is a Thumb1 Register.
-  assert(OpIdx < NumOps && "More operands expected");
-  if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
-    // For UseRt, the reg operand is tied to the first reg operand.
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, ARM::tGPRRegClassID,
-                                    UseRt ? getT1tRt(insn) : getT1tRn(insn))));
-    ++OpIdx;
-  }
-
-  // Special case for tMOVSr.
-  if (OpIdx == NumOps)
-    return true;
-
-  // The next available operand is either a reg operand or an imm operand.
-  if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
-    // Three register operand instructions.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                       getT1tRm(insn))));
-  } else {
-    assert(OpInfo[OpIdx].RegClass < 0 &&
-           !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()
-           && "Pure imm operand expected");
-    unsigned Imm = 0;
-    if (UseRt)
-      Imm = getT1Imm8(insn);
-    else if (Imm3)
-      Imm = getT1Imm3(insn);
-    else {
-      Imm = getT1Imm5(insn);
-      ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11));
-      getImmShiftSE(ShOp, Imm);
-    }
-    MI.addOperand(MCOperand::CreateImm(Imm));
-  }
-  ++OpIdx;
-
-  return true;
-}
-
-// A6.2.2 Data-processing
-//
-// tCMPr, tTST, tCMN: tRd tRn
-// tMVN, tRSB:        tRd CPSR tRn
-// Others:            tRd CPSR tRd(TIED_TO) tRn
-static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         (OpInfo[1].RegClass == ARM::CCRRegClassID
-          || OpInfo[1].RegClass == ARM::tGPRRegClassID)
-         && "Invalid arguments");
-
-  // Add the destination operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRd(insn))));
-  ++OpIdx;
-
-  // Check whether the next operand to be added is a CCR Register.
-  if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) {
-    assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected");
-    MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR));
-    ++OpIdx;
-  }
-
-  // We have either { tRd(TIED_TO), tRn } or { tRn } remaining.
-  // Process the TIED_TO operand first.
-
-  assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID
-         && "Thumb reg operand expected");
-  int Idx;
-  if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-    // The reg operand is tied to the first reg operand.
-    MI.addOperand(MI.getOperand(Idx));
-    ++OpIdx;
-  }
-
-  // Process possible next reg operand.
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
-    // Add tRn operand.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                       getT1tRn(insn))));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// A6.2.3 Special data instructions and branch and exchange
-//
-// tADDhirr: Rd Rd(TIED_TO) Rm
-// tCMPhir:  Rd Rm
-// tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn
-// tBX: Rm
-// tBX_RET: 0 operand
-// tBX_RET_vararg: Rm
-// tBLXr_r9: Rm
-// tBRIND: Rm
-static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  // tBX_RET has 0 operand.
-  if (NumOps == 0)
-    return true;
-
-  // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm.
-  if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX || Opcode==ARM::tBRIND) {
-    if (Opcode == ARM::tBLXr_r9) {
-      // Handling the two predicate operands before the reg operand.
-      if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-        return false;
-      NumOpsAdded += 2;
-    }
-
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       getT1Rm(insn))));
-    NumOpsAdded += 1;
-
-    if (Opcode == ARM::tBX) {
-      // Handling the two predicate operands after the reg operand.
-      if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-        return false;
-      NumOpsAdded += 2;
-    }
-
-    return true;
-  }
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  // Add the destination operand.
-  unsigned RegClass = OpInfo[OpIdx].RegClass;
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass,
-                                  IsGPR(RegClass) ? getT1Rd(insn)
-                                                  : getT1tRd(insn))));
-  ++OpIdx;
-
-  // We have either { Rd(TIED_TO), Rm } or { Rm|tRn } remaining.
-  // Process the TIED_TO operand first.
-
-  assert(OpIdx < NumOps && "More operands expected");
-  int Idx;
-  if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-    // The reg operand is tied to the first reg operand.
-    MI.addOperand(MI.getOperand(Idx));
-    ++OpIdx;
-  }
-
-  // The next reg operand is either Rm or tRn.
-  assert(OpIdx < NumOps && "More operands expected");
-  RegClass = OpInfo[OpIdx].RegClass;
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RegClass,
-                                  IsGPR(RegClass) ? getT1Rm(insn)
-                                                  : getT1tRn(insn))));
-  ++OpIdx;
-
-  return true;
-}
-
-// A8.6.59 LDR (literal)
-//
-// tLDRpci: tRt imm8*4
-static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         (OpInfo[1].RegClass < 0 &&
-          !OpInfo[1].isPredicate() &&
-          !OpInfo[1].isOptionalDef())
-         && "Invalid arguments");
-
-  // Add the destination operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRt(insn))));
-
-  // And the (imm8 << 2) operand.
-  MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn) << 2));
-
-  NumOpsAdded = 2;
-
-  return true;
-}
-
-// Thumb specific addressing modes (see ARMInstrThumb.td):
-//
-// t_addrmode_rr := reg + reg
-//
-// t_addrmode_s4 := reg + reg
-//                  reg + imm5 * 4
-//
-// t_addrmode_s2 := reg + reg
-//                  reg + imm5 * 2
-//
-// t_addrmode_s1 := reg + reg
-//                  reg + imm5
-//
-// t_addrmode_sp := sp + imm8 * 4
-//
-
-// A8.6.63 LDRB (literal)
-// A8.6.79 LDRSB (literal)
-// A8.6.75 LDRH (literal)
-// A8.6.83 LDRSH (literal)
-// A8.6.59 LDR (literal)
-//
-// These instrs calculate an address from the PC value and an immediate offset.
-// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1)
-static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 2 &&
-         OpInfo[0].RegClass == ARM::GPRRegClassID &&
-         OpInfo[1].RegClass < 0 &&
-         "Expect >= 2 operands, first as reg, and second as imm operand");
-
-  // Build the register operand, followed by the (+/-)imm12 immediate.
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  MI.addOperand(MCOperand::CreateImm(decodeImm12(insn)));
-
-  NumOpsAdded = 2;
-
-  return true;
-}
-
-
-// A6.2.4 Load/store single data item
-//
-// Load/Store Register (reg|imm):      tRd tRn imm5|tRm
-// Load Register Signed Byte|Halfword: tRd tRn tRm
-static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  assert(NumOps >= 2
-         && OpInfo[0].RegClass == ARM::tGPRRegClassID
-         && OpInfo[1].RegClass == ARM::tGPRRegClassID
-         && "Expect >= 2 operands and first two as thumb reg operands");
-
-  // Add the destination reg and the base reg.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRd(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRn(insn))));
-  OpIdx = 2;
-
-  // We have either { imm5 } or { tRm } remaining.
-  // Note that STR/LDR (register) should skip the imm5 offset operand for
-  // t_addrmode_s[1|2|4].
-
-  assert(OpIdx < NumOps && "More operands expected");
-
-  if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() &&
-      !OpInfo[OpIdx].isOptionalDef()) {
-    // Table A6-5 16-bit Thumb Load/store instructions
-    // opA = 0b0101 for STR/LDR (register) and friends.
-    // Otherwise, we have STR/LDR (immediate) and friends.
-    assert(opA != 5 && "Immediate operand expected for this opcode");
-    MI.addOperand(MCOperand::CreateImm(getT1Imm5(insn)));
-    ++OpIdx;
-  } else {
-    // The next reg operand is tRm, the offset.
-    assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID
-           && "Thumb reg operand expected");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                       getT1tRm(insn))));
-    ++OpIdx;
-  }
-  return true;
-}
-
-// A6.2.4 Load/store single data item
-//
-// Load/Store Register SP relative: tRt ARM::SP imm8
-static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert((Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
-         && "Unexpected opcode");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         OpInfo[1].RegClass == ARM::GPRRegClassID &&
-         (OpInfo[2].RegClass < 0 &&
-          !OpInfo[2].isPredicate() &&
-          !OpInfo[2].isOptionalDef())
-         && "Invalid arguments");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRt(insn))));
-  MI.addOperand(MCOperand::CreateReg(ARM::SP));
-  MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
-  NumOpsAdded = 3;
-  return true;
-}
-
-// Table A6-1 16-bit Thumb instruction encoding
-// A8.6.10 ADR
-//
-// tADDrPCi: tRt imm8
-static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(Opcode == ARM::tADDrPCi && "Unexpected opcode");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         (OpInfo[1].RegClass < 0 &&
-          !OpInfo[1].isPredicate() &&
-          !OpInfo[1].isOptionalDef())
-         && "Invalid arguments");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRt(insn))));
-  MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
-  NumOpsAdded = 2;
-  return true;
-}
-
-// Table A6-1 16-bit Thumb instruction encoding
-// A8.6.8 ADD (SP plus immediate)
-//
-// tADDrSPi: tRt ARM::SP imm8
-static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(Opcode == ARM::tADDrSPi && "Unexpected opcode");
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         OpInfo[1].RegClass == ARM::GPRRegClassID &&
-         (OpInfo[2].RegClass < 0 &&
-          !OpInfo[2].isPredicate() &&
-          !OpInfo[2].isOptionalDef())
-         && "Invalid arguments");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRt(insn))));
-  MI.addOperand(MCOperand::CreateReg(ARM::SP));
-  MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
-  NumOpsAdded = 3;
-  return true;
-}
-
-// tPUSH, tPOP: Pred-Imm Pred-CCR register_list
-//
-// where register_list = low registers + [lr] for PUSH or
-//                       low registers + [pc] for POP
-//
-// "low registers" is specified by Inst{7-0}
-// lr|pc is specified by Inst{8}
-static bool DisassembleThumb1PushPop(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert((Opcode == ARM::tPUSH || Opcode == ARM::tPOP) && "Unexpected opcode");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  // Handling the two predicate operands before the reglist.
-  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-    OpIdx += 2;
-  else {
-    DEBUG(errs() << "Expected predicate operands not found.\n");
-    return false;
-  }
-
-  unsigned RegListBits = slice(insn, 8, 8) << (Opcode == ARM::tPUSH ? 14 : 15)
-    | slice(insn, 7, 0);
-
-  // Fill the variadic part of reglist.
-  for (unsigned i = 0; i < 16; ++i) {
-    if ((RegListBits >> i) & 1) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         i)));
-      ++OpIdx;
-    }
-  }
-
-  return true;
-}
-
-// A6.2.5 Miscellaneous 16-bit instructions
-// Delegate to DisassembleThumb1PushPop() for tPUSH & tPOP.
-//
-// tADDspi, tSUBspi: ARM::SP ARM::SP(TIED_TO) imm7
-// t2IT:             firstcond=Inst{7-4} mask=Inst{3-0}
-// tCBNZ, tCBZ:      tRd imm6*2
-// tBKPT:            imm8
-// tNOP, tSEV, tYIELD, tWFE, tWFI:
-//   no operand (except predicate pair)
-// tSETENDBE, tSETENDLE, :
-//   no operand
-// Others:           tRd tRn
-static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  if (NumOps == 0)
-    return true;
-
-  if (Opcode == ARM::tPUSH || Opcode == ARM::tPOP)
-    return DisassembleThumb1PushPop(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
-  // Predicate operands are handled elsewhere.
-  if (NumOps == 2 &&
-      OpInfo[0].isPredicate() && OpInfo[1].isPredicate() &&
-      OpInfo[0].RegClass < 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) {
-    return true;
-  }
-
-  if (Opcode == ARM::tADDspi || Opcode == ARM::tSUBspi) {
-    // Special case handling for tADDspi and tSUBspi.
-    // A8.6.8 ADD (SP plus immediate) & A8.6.215 SUB (SP minus immediate)
-    MI.addOperand(MCOperand::CreateReg(ARM::SP));
-    MI.addOperand(MCOperand::CreateReg(ARM::SP));
-    MI.addOperand(MCOperand::CreateImm(getT1Imm7(insn)));
-    NumOpsAdded = 3;
-    return true;
-  }
-
-  if (Opcode == ARM::t2IT) {
-    // Special case handling for If-Then.
-    // A8.6.50 IT
-    // Tag the (firstcond[0] bit << 4) along with mask.
-
-    // firstcond
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 4)));
-
-    // firstcond[0] and mask
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
-    NumOpsAdded = 2;
-    return true;
-  }
-
-  if (Opcode == ARM::tBKPT) {
-    MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); // breakpoint value
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // CPS has a singleton $opt operand that contains the following information:
-  // The first op would be 0b10 as enable and 0b11 as disable in regular ARM,
-  // but in Thumb it's is 0 as enable and 1 as disable. So map it to ARM's
-  // default one. The second get the AIF flags from Inst{2-0}.
-  if (Opcode == ARM::tCPS) {
-    MI.addOperand(MCOperand::CreateImm(2 + slice(insn, 4, 4)));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 2, 0)));
-    NumOpsAdded = 2;
-    return true;
-  }
-
-  assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
-         (OpInfo[1].RegClass < 0 || OpInfo[1].RegClass==ARM::tGPRRegClassID)
-         && "Expect >=2 operands");
-
-  // Add the destination operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                     getT1tRd(insn))));
-
-  if (OpInfo[1].RegClass == ARM::tGPRRegClassID) {
-    // Two register instructions.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                       getT1tRn(insn))));
-  } else {
-    // CBNZ, CBZ
-    assert((Opcode == ARM::tCBNZ || Opcode == ARM::tCBZ) &&"Unexpected opcode");
-    MI.addOperand(MCOperand::CreateImm(getT1Imm6(insn) * 2));
-  }
-
-  NumOpsAdded = 2;
-
-  return true;
-}
-
-// A8.6.53  LDM / LDMIA
-// A8.6.189 STM / STMIA
-//
-// tLDMIA_UPD/tSTMIA_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list
-// tLDMIA:                tRt AM4ModeImm Pred-Imm Pred-CCR register_list
-static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode,
-                                     uint32_t insn, unsigned short NumOps,
-                                     unsigned &NumOpsAdded, BO B) {
-  assert((Opcode == ARM::tLDMIA || Opcode == ARM::tLDMIA_UPD ||
-          Opcode == ARM::tSTMIA_UPD) && "Unexpected opcode");
-
-  unsigned tRt = getT1tRt(insn);
-  NumOpsAdded = 0;
-
-  // WB register, if necessary.
-  if (Opcode == ARM::tLDMIA_UPD || Opcode == ARM::tSTMIA_UPD) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       tRt)));
-    ++NumOpsAdded;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     tRt)));
-  ++NumOpsAdded;
-
-  // Handling the two predicate operands before the reglist.
-  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
-    NumOpsAdded += 2;
-  } else {
-    DEBUG(errs() << "Expected predicate operands not found.\n");
-    return false;
-  }
-
-  unsigned RegListBits = slice(insn, 7, 0);
-  if (BitCount(RegListBits) < 1) {
-    DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n");
-    return false;
-  }
-
-  // Fill the variadic part of reglist.
-  for (unsigned i = 0; i < 8; ++i)
-    if ((RegListBits >> i) & 1) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
-                                                         i)));
-      ++NumOpsAdded;
-    }
-
-  return true;
-}
-
-static bool DisassembleThumb1LdMul(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleThumb1LdStMul(true, MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  B);
-}
-
-static bool DisassembleThumb1StMul(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  return DisassembleThumb1LdStMul(false, MI, Opcode, insn, NumOps, NumOpsAdded,
-                                  B);
-}
-
-// A8.6.16 B Encoding T1
-// cond = Inst{11-8} & imm8 = Inst{7-0}
-// imm32 = SignExtend(imm8:'0', 32)
-//
-// tBcc: offset Pred-Imm Pred-CCR
-// tSVC: imm8 Pred-Imm Pred-CCR
-// tTRAP: 0 operand (early return)
-static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
-  if (Opcode == ARM::tTRAP)
-    return true;
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps == 3 && OpInfo[0].RegClass < 0 &&
-         OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID
-         && "Exactly 3 operands expected");
-
-  unsigned Imm8 = getT1Imm8(insn);
-  MI.addOperand(MCOperand::CreateImm(
-                  Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1)
-                                      : (int)Imm8));
-
-  // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier().
-  // But note that for tBcc, if cond = '1110' then UNDEFINED.
-  if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) {
-    DEBUG(errs() << "if cond = '1110' then UNDEFINED\n");
-    return false;
-  }
-  NumOpsAdded = 1;
-
-  return true;
-}
-
-// A8.6.16 B Encoding T2
-// imm11 = Inst{10-0}
-// imm32 = SignExtend(imm11:'0', 32)
-//
-// tB: offset
-static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps == 1 && OpInfo[0].RegClass < 0 && "1 imm operand expected");
-
-  unsigned Imm11 = getT1Imm11(insn);
-
-  MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1)));
-
-  NumOpsAdded = 1;
-
-  return true;
-
-}
-
-// See A6.2 16-bit Thumb instruction encoding for instruction classes
-// corresponding to op.
-//
-// Table A6-1 16-bit Thumb instruction encoding (abridged)
-// op    Instruction or instruction class
-// ------  --------------------------------------------------------------------
-// 00xxxx  Shift (immediate), add, subtract, move, and compare on page A6-7
-// 010000  Data-processing on page A6-8
-// 010001  Special data instructions and branch and exchange on page A6-9
-// 01001x  Load from Literal Pool, see LDR (literal) on page A8-122
-// 0101xx  Load/store single data item on page A6-10
-// 011xxx
-// 100xxx
-// 10100x  Generate PC-relative address, see ADR on page A8-32
-// 10101x  Generate SP-relative address, see ADD (SP plus immediate) on
-//         page A8-28
-// 1011xx  Miscellaneous 16-bit instructions on page A6-11
-// 11000x  Store multiple registers, see STM / STMIA / STMEA on page A8-374
-// 11001x  Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a
-// 1101xx  Conditional branch, and Supervisor Call on page A6-13
-// 11100x  Unconditional Branch, see B on page A8-44
-//
-static bool DisassembleThumb1(uint16_t op, MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  unsigned op1 = slice(op, 5, 4);
-  unsigned op2 = slice(op, 3, 2);
-  unsigned op3 = slice(op, 1, 0);
-  unsigned opA = slice(op, 5, 2);
-  switch (op1) {
-  case 0:
-    // A6.2.1 Shift (immediate), add, subtract, move, and compare
-    return DisassembleThumb1General(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-  case 1:
-    switch (op2) {
-    case 0:
-      switch (op3) {
-      case 0:
-        // A6.2.2 Data-processing
-        return DisassembleThumb1DP(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      case 1:
-        // A6.2.3 Special data instructions and branch and exchange
-        return DisassembleThumb1Special(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                        B);
-      default:
-        // A8.6.59 LDR (literal)
-        return DisassembleThumb1LdPC(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      }
-      break;
-    default:
-      // A6.2.4 Load/store single data item
-      return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded,
-                                   B);
-      break;
-    }
-    break;
-  case 2:
-    switch (op2) {
-    case 0:
-      // A6.2.4 Load/store single data item
-      return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded,
-                                   B);
-    case 1:
-      // A6.2.4 Load/store single data item
-      return DisassembleThumb1LdStSP(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-    case 2:
-      if (op3 <= 1) {
-        // A8.6.10 ADR
-        return DisassembleThumb1AddPCi(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                       B);
-      } else {
-        // A8.6.8 ADD (SP plus immediate)
-        return DisassembleThumb1AddSPi(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                       B);
-      }
-    default:
-      // A6.2.5 Miscellaneous 16-bit instructions
-      return DisassembleThumb1Misc(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-    }
-    break;
-  case 3:
-    switch (op2) {
-    case 0:
-      if (op3 <= 1) {
-        // A8.6.189 STM / STMIA / STMEA
-        return DisassembleThumb1StMul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      } else {
-        // A8.6.53 LDM / LDMIA / LDMFD
-        return DisassembleThumb1LdMul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      }
-    case 1:
-      // A6.2.6 Conditional branch, and Supervisor Call
-      return DisassembleThumb1CondBr(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-    case 2:
-      // Unconditional Branch, see B on page A8-44
-      return DisassembleThumb1Br(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-    default:
-      assert(0 && "Unreachable code");
-      break;
-    }
-    break;
-  default:
-    assert(0 && "Unreachable code");
-    break;
-  }
-
-  return false;
-}
-
-///////////////////////////////////////////////
-//                                           //
-// Thumb2 instruction disassembly functions. //
-//                                           //
-///////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////
-//                                                       //
-// Note: the register naming follows the ARM convention! //
-//                                                       //
-///////////////////////////////////////////////////////////
-
-static inline bool Thumb2SRSOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::t2SRSDBW: case ARM::t2SRSDB:
-  case ARM::t2SRSIAW: case ARM::t2SRSIA:
-    return true;
-  }
-}
-
-static inline bool Thumb2RFEOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::t2RFEDBW: case ARM::t2RFEDB:
-  case ARM::t2RFEIAW: case ARM::t2RFEIA:
-    return true;
-  }
-}
-
-// t2SRS[IA|DB]W/t2SRS[IA|DB]: mode_imm = Inst{4-0}
-static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded) {
-  MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
-  NumOpsAdded = 1;
-  return true;
-}
-
-// t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn
-static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  unsigned Rn = decodeRn(insn);
-  if (Rn == 15) {
-    DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
-    return false;
-  }
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn)));
-  NumOpsAdded = 1;
-  return true;
-}
-
-static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  if (Thumb2SRSOpcode(Opcode))
-    return DisassembleThumb2SRS(MI, Opcode, insn, NumOps, NumOpsAdded);
-
-  if (Thumb2RFEOpcode(Opcode))
-    return DisassembleThumb2RFE(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
-  assert((Opcode == ARM::t2LDMIA || Opcode == ARM::t2LDMIA_UPD ||
-          Opcode == ARM::t2LDMDB || Opcode == ARM::t2LDMDB_UPD ||
-          Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD ||
-          Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD)
-         && "Unexpected opcode");
-  assert(NumOps >= 4 && "Thumb2 LdStMul expects NumOps >= 4");
-
-  NumOpsAdded = 0;
-
-  unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
-  // Writeback to base.
-  if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD ||
-      Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB_UPD) {
-    MI.addOperand(MCOperand::CreateReg(Base));
-    ++NumOpsAdded;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(Base));
-  ++NumOpsAdded;
-
-  // Handling the two predicate operands before the reglist.
-  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
-    NumOpsAdded += 2;
-  } else {
-    DEBUG(errs() << "Expected predicate operands not found.\n");
-    return false;
-  }
-
-  unsigned RegListBits = insn & ((1 << 16) - 1);
-
-  // Fill the variadic part of reglist.
-  for (unsigned i = 0; i < 16; ++i)
-    if ((RegListBits >> i) & 1) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                         i)));
-      ++NumOpsAdded;
-    }
-
-  return true;
-}
-
-// t2LDREX: Rd Rn
-// t2LDREXD: Rd Rs Rn
-// t2LDREXB, t2LDREXH: Rd Rn
-// t2STREX: Rs Rd Rn
-// t2STREXD: Rm Rd Rs Rn
-// t2STREXB, t2STREXH: Rm Rd Rn
-static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2
-         && OpInfo[0].RegClass > 0
-         && OpInfo[1].RegClass > 0
-         && "Expect >=2 operands and first two as reg operands");
-
-  bool isStore = (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH);
-  bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX);
-  bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD);
-
-  unsigned Rt  = decodeRd(insn);
-  unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX.
-  unsigned Rd  = decodeRm(insn);
-  unsigned Rn  = decodeRn(insn);
-
-  // Some sanity checking first.
-  if (isStore) {
-    // if d == n || d == t then UNPREDICTABLE
-    // if d == n || d == t || d == t2 then UNPREDICTABLE
-    if (isDW) {
-      if (Rd == Rn || Rd == Rt || Rd == Rt2) {
-        DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n");
-        return false;
-      }
-    } else {
-      if (isSW) {
-        if (Rt2 == Rn || Rt2 == Rt) {
-          DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
-          return false;
-        }
-      } else {
-        if (Rd == Rn || Rd == Rt) {
-          DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
-          return false;
-        }
-      }
-    }
-  } else {
-    // Load
-    // A8.6.71 LDREXD
-    // if t == t2 then UNPREDICTABLE
-    if (isDW && Rt == Rt2) {
-      DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
-      return false;
-    }
-  }
-
-  // Add the destination operand for store.
-  if (isStore) {
-    MI.addOperand(MCOperand::CreateReg(
-                    getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                    isSW ? Rt2 : Rd)));
-    ++OpIdx;
-  }
-
-  // Source operand for store and destination operand for load.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     Rt)));
-  ++OpIdx;
-
-  // Thumb2 doubleword complication: with an extra source/destination operand.
-  if (isDW) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
-                                                       Rt2)));
-    ++OpIdx;
-  }
-
-  // Finally add the pointer operand.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     Rn)));
-  ++OpIdx;
-
-  return true;
-}
-
-// t2LDRDi8: Rd Rs Rn imm8s4 (offset mode)
-// t2LDRDpci: Rd Rs imm8s4 (Not decoded, prefer the generic t2LDRDi8 version)
-// t2STRDi8: Rd Rs Rn imm8s4 (offset mode)
-//
-// Ditto for t2LDRD_PRE, t2LDRD_POST, t2STRD_PRE, t2STRD_POST, which are for
-// disassembly only and do not have a tied_to writeback base register operand.
-static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 4
-         && OpInfo[0].RegClass > 0
-         && OpInfo[0].RegClass == OpInfo[1].RegClass
-         && OpInfo[2].RegClass > 0
-         && OpInfo[3].RegClass < 0
-         && "Expect >= 4 operands and first 3 as reg operands");
-
-  // Thumnb allows for specifying Rt and Rt2, unlike ARM (which has Rt2==Rt+1).
-  unsigned Rt  = decodeRd(insn);
-  unsigned Rt2 = decodeRs(insn);
-  unsigned Rn  = decodeRn(insn);
-
-  // Some sanity checking first.
-
-  // A8.6.67 LDRD (literal) has its W bit as (0).
-  if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) {
-    if (Rn == 15 && slice(insn, 21, 21) != 0)
-      return false;
-  } else {
-    // For Dual Store, PC cannot be used as the base register.
-    if (Rn == 15) {
-      DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
-      return false;
-    }
-  }
-  if (Rt == Rt2) {
-    DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
-    return false;
-  }
-  if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) {
-    if (Rn == Rt || Rn == Rt2) {
-      DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n");
-      return false;
-    }
-  }
-
-  // Add the <Rt> <Rt2> operands.
-  unsigned RegClassPair = OpInfo[0].RegClass;
-  unsigned RegClassBase = OpInfo[2].RegClass;
-  
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
-                                                     decodeRd(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
-                                                     decodeRs(insn))));
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassBase,
-                                                     decodeRn(insn))));
-
-  // Finally add (+/-)imm8*4, depending on the U bit.
-  int Offset = getImm8(insn) * 4;
-  if (getUBit(insn) == 0)
-    Offset = -Offset;
-  MI.addOperand(MCOperand::CreateImm(Offset));
-  NumOpsAdded = 4;
-
-  return true;
-}
-
-// t2TBB, t2TBH: Rn Rm Pred-Imm Pred-CCR
-static bool DisassembleThumb2TB(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert(NumOps >= 2 && "Expect >= 2 operands");
-
-  // The generic version of TBB/TBH needs a base register.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  // Add the index register.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRm(insn))));
-  NumOpsAdded = 2;
-
-  return true;
-}
-
-static inline bool Thumb2ShiftOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::t2MOVCClsl: case ARM::t2MOVCClsr:
-  case ARM::t2MOVCCasr: case ARM::t2MOVCCror:
-  case ARM::t2LSLri:    case ARM::t2LSRri:
-  case ARM::t2ASRri:    case ARM::t2RORri:
-    return true;
-  }
-}
-
-// A6.3.11 Data-processing (shifted register)
-//
-// Two register operands (Rn=0b1111 no 1st operand reg): Rs Rm
-// Two register operands (Rs=0b1111 no dst operand reg): Rn Rm
-// Three register operands: Rs Rn Rm
-// Three register operands: (Rn=0b1111 Conditional Move) Rs Ro(TIED_TO) Rm
-//
-// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
-// register with shift forms: (Rm, ConstantShiftSpecifier).
-// Constant shift specifier: Imm = (ShOp | ShAmt<<3).
-//
-// There are special instructions, like t2MOVsra_flag and t2MOVsrl_flag, which
-// only require two register operands: Rd, Rm in ARM Reference Manual terms, and
-// nothing else, because the shift amount is already specified.
-// Similar case holds for t2MOVrx, t2ADDrr, ..., etc.
-static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  // Special case handling.
-  if (Opcode == ARM::t2BR_JT) {
-    assert(NumOps == 4
-           && OpInfo[0].RegClass == ARM::GPRRegClassID
-           && OpInfo[1].RegClass == ARM::GPRRegClassID
-           && OpInfo[2].RegClass < 0
-           && OpInfo[3].RegClass < 0
-           && "Exactly 4 operands expect and first two as reg operands");
-    // Only need to populate the src reg operand.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    MI.addOperand(MCOperand::CreateReg(0));
-    MI.addOperand(MCOperand::CreateImm(0));
-    MI.addOperand(MCOperand::CreateImm(0));
-    NumOpsAdded = 4;
-    return true;
-  }
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2
-         && (OpInfo[0].RegClass == ARM::GPRRegClassID ||
-             OpInfo[0].RegClass == ARM::rGPRRegClassID)
-         && (OpInfo[1].RegClass == ARM::GPRRegClassID ||
-             OpInfo[1].RegClass == ARM::rGPRRegClassID)
-         && "Expect >= 2 operands and first two as reg operands");
-
-  bool ThreeReg = (NumOps > 2 && (OpInfo[2].RegClass == ARM::GPRRegClassID ||
-                                  OpInfo[2].RegClass == ARM::rGPRRegClassID));
-  bool NoDstReg = (decodeRs(insn) == 0xF);
-
-  // Build the register operands, followed by the constant shift specifier.
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, OpInfo[0].RegClass,
-                                  NoDstReg ? decodeRn(insn) : decodeRs(insn))));
-  ++OpIdx;
-
-  if (ThreeReg) {
-    int Idx;
-    if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-      // Process tied_to operand constraint.
-      MI.addOperand(MI.getOperand(Idx));
-      ++OpIdx;
-    } else if (!NoDstReg) {
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[1].RegClass,
-                                                         decodeRn(insn))));
-      ++OpIdx;
-    } else {
-      DEBUG(errs() << "Thumb2 encoding error: d==15 for three-reg operands.\n");
-      return false;
-    }
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeRm(insn))));
-  ++OpIdx;
-
-  if (NumOps == OpIdx)
-    return true;
-
-  if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
-      && !OpInfo[OpIdx].isOptionalDef()) {
-
-    if (Thumb2ShiftOpcode(Opcode)) {
-      unsigned Imm = getShiftAmtBits(insn);
-      ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4));
-      getImmShiftSE(ShOp, Imm);
-      MI.addOperand(MCOperand::CreateImm(Imm));
-    } else {
-      // Build the constant shift specifier operand.
-      unsigned bits2 = getShiftTypeBits(insn);
-      unsigned imm5 = getShiftAmtBits(insn);
-      ARM_AM::ShiftOpc ShOp = ARM_AM::no_shift;
-      unsigned ShAmt = decodeImmShift(bits2, imm5, ShOp);
-      MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShAmt)));
-    }
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// A6.3.1 Data-processing (modified immediate)
-//
-// Two register operands: Rs Rn ModImm
-// One register operands (Rs=0b1111 no explicit dest reg): Rn ModImm
-// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm -
-// {t2MOVi, t2MVNi}
-//
-// ModImm = ThumbExpandImm(i:imm3:imm8)
-static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned RdRegClassID = OpInfo[0].RegClass;
-  assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID ||
-                         RdRegClassID == ARM::rGPRRegClassID)
-         && "Expect >= 2 operands and first one as reg operand");
-
-  unsigned RnRegClassID = OpInfo[1].RegClass;
-  bool TwoReg = (RnRegClassID == ARM::GPRRegClassID
-                 || RnRegClassID == ARM::rGPRRegClassID);
-  bool NoDstReg = (decodeRs(insn) == 0xF);
-
-  // Build the register operands, followed by the modified immediate.
-
-  MI.addOperand(MCOperand::CreateReg(
-                  getRegisterEnum(B, RdRegClassID,
-                                  NoDstReg ? decodeRn(insn) : decodeRs(insn))));
-  ++OpIdx;
-
-  if (TwoReg) {
-    if (NoDstReg) {
-      DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n");
-      return false;
-    }
-    int Idx;
-    if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-      // The reg operand is tied to the first reg operand.
-      MI.addOperand(MI.getOperand(Idx));
-    } else {
-      // Add second reg operand.
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
-                                                         decodeRn(insn))));
-    }
-    ++OpIdx;
-  }
-
-  // The modified immediate operand should come next.
-  assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 &&
-         !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()
-         && "Pure imm operand expected");
-
-  // i:imm3:imm8
-  // A6.3.2 Modified immediate constants in Thumb instructions
-  unsigned imm12 = getIImm3Imm8(insn);
-  MI.addOperand(MCOperand::CreateImm(ThumbExpandImm(imm12)));
-  ++OpIdx;
-
-  return true;
-}
-
-static inline bool Thumb2SaturateOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  case ARM::t2SSAT: case ARM::t2SSAT16:
-  case ARM::t2USAT: case ARM::t2USAT16:
-    return true;
-  default:
-    return false;
-  }
-}
-
-/// DisassembleThumb2Sat - Disassemble Thumb2 saturate instructions:
-/// o t2SSAT, t2USAT: Rs sat_pos Rn shamt
-/// o t2SSAT16, t2USAT16: Rs sat_pos Rn
-static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn,
-                                 unsigned &NumOpsAdded, BO B) {
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  NumOpsAdded = MCID.getNumOperands() - 2; // ignore predicate operands
-
-  // Disassemble the register def.
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRs(insn))));
-
-  unsigned Pos = slice(insn, 4, 0);
-  if (Opcode == ARM::t2SSAT || Opcode == ARM::t2SSAT16)
-    Pos += 1;
-  MI.addOperand(MCOperand::CreateImm(Pos));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRn(insn))));
-
-  if (NumOpsAdded == 4) {
-    ARM_AM::ShiftOpc Opc = (slice(insn, 21, 21) != 0 ?
-                            ARM_AM::asr : ARM_AM::lsl);
-    // Inst{14-12:7-6} encodes the imm5 shift amount.
-    unsigned ShAmt = slice(insn, 14, 12) << 2 | slice(insn, 7, 6);
-    if (ShAmt == 0) {
-      if (Opc == ARM_AM::asr)
-        ShAmt = 32;
-      else
-        Opc = ARM_AM::no_shift;
-    }
-    MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt)));
-  }
-  return true;
-}
-
-// A6.3.3 Data-processing (plain binary immediate)
-//
-// o t2ADDri12, t2SUBri12: Rs Rn imm12
-// o t2LEApcrel (ADR): Rs imm12
-// o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm
-// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm
-// o t2MOVi16: Rs imm16
-// o t2MOVTi16: Rs imm16
-// o t2SBFX (SBFX): Rs Rn lsb width
-// o t2UBFX (UBFX): Rs Rn lsb width
-// o t2BFI (BFI): Rs Rn lsb width
-static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  unsigned RdRegClassID = OpInfo[0].RegClass;
-  assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID ||
-                         RdRegClassID == ARM::rGPRRegClassID)
-         && "Expect >= 2 operands and first one as reg operand");
-
-  unsigned RnRegClassID = OpInfo[1].RegClass;
-  bool TwoReg = (RnRegClassID == ARM::GPRRegClassID
-                 || RnRegClassID == ARM::rGPRRegClassID);
-
-  // Build the register operand(s), followed by the immediate(s).
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RdRegClassID,
-                                                     decodeRs(insn))));
-  ++OpIdx;
-
-  if (TwoReg) {
-    assert(NumOps >= 3 && "Expect >= 3 operands");
-    int Idx;
-    if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
-      // Process tied_to operand constraint.
-      MI.addOperand(MI.getOperand(Idx));
-    } else {
-      // Add src reg operand.
-      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
-                                                         decodeRn(insn))));
-    }
-    ++OpIdx;
-  }
-
-  if (Opcode == ARM::t2BFI) {
-    // Add val reg operand.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
-         && !OpInfo[OpIdx].isOptionalDef()
-         && "Pure imm operand expected");
-
-  // Pre-increment OpIdx.
-  ++OpIdx;
-
-  if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12
-      || Opcode == ARM::t2LEApcrel)
-    MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn)));
-  else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) {
-    if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI))
-      MI.addOperand(MCOperand::CreateImm(getImm16(insn)));
-  } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) {
-    uint32_t mask = 0;
-    if (getBitfieldInvMask(insn, mask))
-      MI.addOperand(MCOperand::CreateImm(mask));
-    else
-      return false;
-  } else {
-    // Handle the case of: lsb width
-    assert((Opcode == ARM::t2SBFX || Opcode == ARM::t2UBFX)
-            && "Unexpected opcode");
-    MI.addOperand(MCOperand::CreateImm(getLsb(insn)));
-    MI.addOperand(MCOperand::CreateImm(getWidthMinus1(insn) + 1));
-
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// A6.3.4 Table A6-15 Miscellaneous control instructions
-// A8.6.41 DMB
-// A8.6.42 DSB
-// A8.6.49 ISB
-static inline bool t2MiscCtrlInstr(uint32_t insn) {
-  if (slice(insn, 31, 20) == 0xf3b && slice(insn, 15, 14) == 2 &&
-      slice(insn, 12, 12) == 0)
-    return true;
-
-  return false;
-}
-
-// A6.3.4 Branches and miscellaneous control
-//
-// A8.6.16 B
-// Branches: t2B, t2Bcc -> imm operand
-//
-// Branches: t2TPsoft -> no operand
-//
-// A8.6.23 BL, BLX (immediate)
-// Branches (defined in ARMInstrThumb.td): tBLr9, tBLXi_r9 -> imm operand
-//
-// A8.6.26
-// t2BXJ -> Rn
-//
-// Miscellaneous control:
-//   -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants)
-//
-// Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV
-//   -> no operand (except pred-imm pred-ccr)
-//
-// t2DBG -> imm4 = Inst{3-0}
-//
-// t2MRS/t2MRSsys -> Rs
-// t2MSR/t2MSRsys -> Rn mask=Inst{11-8}
-// t2SMC -> imm4 = Inst{19-16}
-static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  if (NumOps == 0)
-    return true;
-
-  if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) {
-    // Inst{3-0} encodes the memory barrier option for the variants.
-    unsigned opt = slice(insn, 3, 0);
-    switch (opt) {
-    case ARM_MB::SY:  case ARM_MB::ST:
-    case ARM_MB::ISH: case ARM_MB::ISHST:
-    case ARM_MB::NSH: case ARM_MB::NSHST:
-    case ARM_MB::OSH: case ARM_MB::OSHST:
-      MI.addOperand(MCOperand::CreateImm(opt));
-      NumOpsAdded = 1;
-      return true;
-    default:
-      return false;
-    }
-  }
-
-  if (t2MiscCtrlInstr(insn))
-    return true;
-
-  switch (Opcode) {
-  case ARM::t2CLREX:
-  case ARM::t2NOP:
-  case ARM::t2YIELD:
-  case ARM::t2WFE:
-  case ARM::t2WFI:
-  case ARM::t2SEV:
-    return true;
-  default:
-    break;
-  }
-
-  // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
-  // opcodes which match the same real instruction. This is needed since there's
-  // no current handling of optional arguments. Fix here when a better handling
-  // of optional arguments is implemented.
-  if (Opcode == ARM::t2CPS3p) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5)));  // iflags
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));  // mode
-    NumOpsAdded = 3;
-    return true;
-  }
-  if (Opcode == ARM::t2CPS2p) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5)));  // iflags
-    NumOpsAdded = 2;
-    return true;
-  }
-  if (Opcode == ARM::t2CPS1p) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // DBG has its option specified in Inst{3-0}.
-  if (Opcode == ARM::t2DBG) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0)));
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // MRS and MRSsys take one GPR reg Rs.
-  if (Opcode == ARM::t2MRS || Opcode == ARM::t2MRSsys) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRs(insn))));
-    NumOpsAdded = 1;
-    return true;
-  }
-  // BXJ takes one GPR reg Rn.
-  if (Opcode == ARM::t2BXJ) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    NumOpsAdded = 1;
-    return true;
-  }
-  // MSR take a mask, followed by one GPR reg Rn. The mask contains the R Bit in
-  // bit 4, and the special register fields in bits 3-0.
-  if (Opcode == ARM::t2MSR) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 20) << 4 /* R Bit */ |
-                                       slice(insn, 11, 8) /* Special Reg */));
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    NumOpsAdded = 2;
-    return true;
-  }
-  // SMC take imm4.
-  if (Opcode == ARM::t2SMC) {
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16)));
-    NumOpsAdded = 1;
-    return true;
-  }
-
-  // Some instructions have predicate operands first before the immediate.
-  if (Opcode == ARM::tBLXi_r9 || Opcode == ARM::tBLr9) {
-    // Handling the two predicate operands before the imm operand.
-    if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-      NumOpsAdded += 2;
-    else {
-      DEBUG(errs() << "Expected predicate operands not found.\n");
-      return false;
-    }
-  }
-
-  // Add the imm operand.
-  int Offset = 0;
-
-  switch (Opcode) {
-  default:
-    assert(0 && "Unexpected opcode");
-    return false;
-  case ARM::t2B:
-    Offset = decodeImm32_B_EncodingT4(insn);
-    break;
-  case ARM::t2Bcc:
-    Offset = decodeImm32_B_EncodingT3(insn);
-    break;
-  case ARM::tBLr9:
-    Offset = decodeImm32_BL(insn);
-    break;
-  case ARM::tBLXi_r9:
-    Offset = decodeImm32_BLX(insn);
-    break;
-  }
-
-  if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI))
-    MI.addOperand(MCOperand::CreateImm(Offset));
-
-  // This is an increment as some predicate operands may have been added first.
-  NumOpsAdded += 1;
-
-  return true;
-}
-
-static inline bool Thumb2PreloadOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::t2PLDi12:   case ARM::t2PLDi8:
-  case ARM::t2PLDs:
-  case ARM::t2PLDWi12:  case ARM::t2PLDWi8:
-  case ARM::t2PLDWs:
-  case ARM::t2PLIi12:   case ARM::t2PLIi8:
-  case ARM::t2PLIs:
-    return true;
-  }
-}
-
-static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  // Preload Data/Instruction requires either 2 or 3 operands.
-  // t2PLDi12, t2PLDi8, t2PLDpci: Rn [+/-]imm12/imm8
-  // t2PLDr:                      Rn Rm
-  // t2PLDs:                      Rn Rm imm2=Inst{5-4}
-  // Same pattern applies for t2PLDW* and t2PLI*.
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2 &&
-         OpInfo[0].RegClass == ARM::GPRRegClassID &&
-         "Expect >= 2 operands and first one as reg operand");
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
-  ++OpIdx;
-
-  if (OpInfo[OpIdx].RegClass == ARM::rGPRRegClassID) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-  } else {
-    assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
-           && !OpInfo[OpIdx].isOptionalDef()
-           && "Pure imm operand expected");
-    int Offset = 0;
-    if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
-        Opcode == ARM::t2PLIi8) {
-      // A8.6.117 Encoding T2: add = FALSE
-      unsigned Imm8 = getImm8(insn);
-      Offset = -1 * Imm8;
-    } else {
-      // The i12 forms.  See, for example, A8.6.117 Encoding T1.
-      // Note that currently t2PLDi12 also handles the previously named t2PLDpci
-      // opcode, that's why we use decodeImm12(insn) which returns +/- imm12.
-      Offset = decodeImm12(insn);
-    }
-    MI.addOperand(MCOperand::CreateImm(Offset));
-  }
-  ++OpIdx;
-
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 &&
-      !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs.
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4)));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load,
-      unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) {
-
-  // Inst{22-21} encodes the data item transferred for load/store.
-  // For single word, it is encoded as ob10.
-  bool Word = (slice(insn, 22, 21) == 2);
-  bool Half = (slice(insn, 22, 21) == 1);
-  bool Byte = (slice(insn, 22, 21) == 0);
-
-  if (UseRm && BadReg(R2)) {
-    DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n");
-    return true;
-  }
-
-  if (Load) {
-    if (!Word && R0 == 13) {
-      DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n");
-      return true;
-    }
-    if (Byte) {
-      if (WB && R0 == 15 && slice(insn, 10, 8) == 3)  {
-        // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0)
-        DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
-        return true;
-      }
-    }
-    // A6.3.8 Load halfword, memory hints
-    if (Half) {
-      if (WB) {
-        if (R0 == R1)  {
-          // A8.6.82 LDRSH (immediate) Encoding T2
-          DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n");
-          return true;
-        }
-        if (R0 == 15 && slice(insn, 10, 8) == 3)  {
-          // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0)
-          DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
-          return true;
-        }
-      } else {
-        if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) {
-          if (R0 == 15 && slice(insn, 10, 8) == 4) {
-            // A8.6.82 LDRSH (immediate) Encoding T2
-            DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE"
-                         << " \"Unallocated memory hints\"\n");
-            return true;
-          }
-        } else {
-          if (R0 == 15) {
-            // A8.6.82 LDRSH (immediate) Encoding T1
-            DEBUG(errs() << "if Rt == '1111' then SEE"
-                         << " \"Unallocated memory hints\"\n");
-            return true;
-          }
-        }
-      }
-    }
-  } else {
-    if (WB && R0 == R1) {
-      DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
-      return true;
-    }
-    if ((WB && R0 == 15) || (!WB && R1 == 15)) {
-      DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n");
-      return true;
-    }
-    if (Word) {
-      if ((WB && R1 == 15) || (!WB && R0 == 15)) {
-        DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
-        return true;
-      }
-    } else {
-      if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) {
-        DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n");
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// A6.3.10 Store single data item
-// A6.3.9 Load byte, memory hints
-// A6.3.8 Load halfword, memory hints
-// A6.3.7 Load word
-//
-// For example,
-//
-// t2LDRi12:   Rd Rn (+)imm12
-// t2LDRi8:    Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2LDRs:     Rd Rn Rm ConstantShiftSpecifier (see also
-//             DisassembleThumb2DPSoReg)
-// t2LDR_POST: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2LDR_PRE:  Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-//
-// t2STRi12:   Rd Rn (+)imm12
-// t2STRi8:    Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2STRs:     Rd Rn Rm ConstantShiftSpecifier (see also
-//             DisassembleThumb2DPSoReg)
-// t2STR_POST: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2STR_PRE:  Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-//
-// Note that for indexed modes, the Rn(TIED_TO) operand needs to be populated
-// correctly, as LLVM AsmPrinter depends on it.  For indexed stores, the first
-// operand is Rn; for all the other instructions, Rd is the first operand.
-//
-// Delegates to DisassembleThumb2PreLoad() for preload data/instruction.
-// Delegates to DisassembleThumb2Ldpci() for load * literal operations.
-static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  unsigned Rn = decodeRn(insn);
-
-  if (Thumb2PreloadOpcode(Opcode))
-    return DisassembleThumb2PreLoad(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
-  // See, for example, A6.3.7 Load word: Table A6-18 Load word.
-  if (Load && Rn == 15)
-    return DisassembleThumb2Ldpci(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass > 0 &&
-         OpInfo[1].RegClass > 0 &&
-         "Expect >= 3 operands and first two as reg operands");
-
-  bool ThreeReg = (OpInfo[2].RegClass > 0);
-  bool TIED_TO = ThreeReg && MCID.getOperandConstraint(2, MCOI::TIED_TO) != -1;
-  bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td
-
-  // Build the register operands, followed by the immediate.
-  unsigned R0 = 0, R1 = 0, R2 = 0;
-  unsigned Rd = decodeRd(insn);
-  int Imm = 0;
-
-  if (!Load && TIED_TO) {
-    R0 = Rn;
-    R1 = Rd;
-  } else {
-    R0 = Rd;
-    R1 = Rn;
-  }
-  if (ThreeReg) {
-    if (TIED_TO) {
-      R2 = Rn;
-      Imm = decodeImm8(insn);
-    } else {
-      R2 = decodeRm(insn);
-      // See, for example, A8.6.64 LDRB (register).
-      // And ARMAsmPrinter::printT2AddrModeSoRegOperand().
-      // LSL is the default shift opc, and LLVM does not expect it to be encoded
-      // as part of the immediate operand.
-      // Imm = ARM_AM::getSORegOpc(ARM_AM::lsl, slice(insn, 5, 4));
-      Imm = slice(insn, 5, 4);
-    }
-  } else {
-    if (Imm12)
-      Imm = getImm12(insn);
-    else
-      Imm = decodeImm8(insn);
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     R0)));
-  ++OpIdx;
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     R1)));
-  ++OpIdx;
-
-  if (ThreeReg) {
-    // This could be an offset register or a TIED_TO register.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
-                                                       R2)));
-    ++OpIdx;
-  }
-
-  if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg & !TIED_TO,
-                        TIED_TO))
-    return false;
-
-  assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
-         && !OpInfo[OpIdx].isOptionalDef()
-         && "Pure imm operand expected");
-
-  MI.addOperand(MCOperand::CreateImm(Imm));
-  ++OpIdx;
-
-  return true;
-}
-
-// A6.3.12 Data-processing (register)
-//
-// Two register operands [rotate]:   Rs Rm [rotation(= (rotate:'000'))]
-// Three register operands only:     Rs Rn Rm
-// Three register operands [rotate]: Rs Rn Rm [rotation(= (rotate:'000'))]
-//
-// Parallel addition and subtraction 32-bit Thumb instructions: Rs Rn Rm
-//
-// Miscellaneous operations: Rs [Rn] Rm
-static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCInstrDesc &MCID = ARMInsts[Opcode];
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
-
-  assert(NumOps >= 2 &&
-         OpInfo[0].RegClass > 0 &&
-         OpInfo[1].RegClass > 0 &&
-         "Expect >= 2 operands and first two as reg operands");
-
-  // Build the register operands, followed by the optional rotation amount.
-
-  bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass > 0;
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeRs(insn))));
-  ++OpIdx;
-
-  if (ThreeReg) {
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
-                                                       decodeRn(insn))));
-    ++OpIdx;
-  }
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeRm(insn))));
-  ++OpIdx;
-
-  if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
-      && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // Add the rotation amount immediate.
-    MI.addOperand(MCOperand::CreateImm(decodeRotate(insn)));
-    ++OpIdx;
-  }
-
-  return true;
-}
-
-// A6.3.16 Multiply, multiply accumulate, and absolute difference
-//
-// t2MLA, t2MLS, t2SMMLA, t2SMMLS: Rs Rn Rm Ra=Inst{15-12}
-// t2MUL, t2SMMUL:                 Rs Rn Rm
-// t2SMLA[BB|BT|TB|TT|WB|WT]:      Rs Rn Rm Ra=Inst{15-12}
-// t2SMUL[BB|BT|TB|TT|WB|WT]:      Rs Rn Rm
-//
-// Dual halfword multiply: t2SMUAD[X], t2SMUSD[X], t2SMLAD[X], t2SMLSD[X]:
-//   Rs Rn Rm Ra=Inst{15-12}
-//
-// Unsigned Sum of Absolute Differences [and Accumulate]
-//    Rs Rn Rm [Ra=Inst{15-12}]
-static bool DisassembleThumb2Mul(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::rGPRRegClassID &&
-         OpInfo[1].RegClass == ARM::rGPRRegClassID &&
-         OpInfo[2].RegClass == ARM::rGPRRegClassID &&
-         "Expect >= 3 operands and first three as reg operands");
-
-  // Build the register operands.
-
-  bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID;
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRs(insn))));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRn(insn))));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRm(insn))));
-
-  if (FourReg)
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                       decodeRd(insn))));
-
-  NumOpsAdded = FourReg ? 4 : 3;
-
-  return true;
-}
-
-// A6.3.17 Long multiply, long multiply accumulate, and divide
-//
-// t2SMULL, t2UMULL, t2SMLAL, t2UMLAL, t2UMAAL: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Halfword multiple accumulate long: t2SMLAL<x><y>: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Dual halfword multiple: t2SMLALD[X], t2SMLSLD[X]: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Signed/Unsigned divide: t2SDIV, t2UDIV: Rs Rn Rm
-static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
-  assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::rGPRRegClassID &&
-         OpInfo[1].RegClass == ARM::rGPRRegClassID &&
-         OpInfo[2].RegClass == ARM::rGPRRegClassID &&
-         "Expect >= 3 operands and first three as reg operands");
-
-  bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID;
-
-  // Build the register operands.
-
-  if (FourReg)
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                       decodeRd(insn))));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRs(insn))));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRn(insn))));
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
-                                                     decodeRm(insn))));
-
-  if (FourReg)
-    NumOpsAdded = 4;
-  else
-    NumOpsAdded = 3;
-
-  return true;
-}
-
-// See A6.3 32-bit Thumb instruction encoding for instruction classes
-// corresponding to (op1, op2, op).
-//
-// Table A6-9 32-bit Thumb instruction encoding
-// op1  op2    op  Instruction class, see
-// ---  -------  --  -----------------------------------------------------------
-// 01  00xx0xx  -  Load/store multiple on page A6-23
-//     00xx1xx  -  Load/store dual, load/store exclusive, table branch on
-//                 page A6-24
-//     01xxxxx  -  Data-processing (shifted register) on page A6-31
-//     1xxxxxx  -  Coprocessor instructions on page A6-40
-// 10  x0xxxxx  0  Data-processing (modified immediate) on page A6-15
-//     x1xxxxx  0  Data-processing (plain binary immediate) on page A6-19
-//         -    1  Branches and miscellaneous control on page A6-20
-// 11  000xxx0  -  Store single data item on page A6-30
-//     001xxx0  -  Advanced SIMD element or structure load/store instructions
-//                 on page A7-27
-//     00xx001  - Load byte, memory hints on page A6-28
-//     00xx011  -  Load halfword, memory hints on page A6-26
-//     00xx101  -  Load word on page A6-25
-//     00xx111  -  UNDEFINED
-//     010xxxx  -  Data-processing (register) on page A6-33
-//     0110xxx  -  Multiply, multiply accumulate, and absolute difference on
-//                 page A6-38
-//     0111xxx  -  Long multiply, long multiply accumulate, and divide on
-//                 page A6-39
-//     1xxxxxx  -  Coprocessor instructions on page A6-40
-//
-static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op,
-    MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps,
-    unsigned &NumOpsAdded, BO B) {
-
-  switch (op1) {
-  case 1:
-    if (slice(op2, 6, 5) == 0) {
-      if (slice(op2, 2, 2) == 0) {
-        // Load/store multiple.
-        return DisassembleThumb2LdStMul(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                        B);
-      }
-
-      // Load/store dual, load/store exclusive, table branch, otherwise.
-      assert(slice(op2, 2, 2) == 1 && "Thumb2 encoding error!");
-      if ((ARM::t2LDREX <= Opcode && Opcode <= ARM::t2LDREXH) ||
-          (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH)) {
-        // Load/store exclusive.
-        return DisassembleThumb2LdStEx(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                       B);
-      }
-      if (Opcode == ARM::t2LDRDi8 ||
-          Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST ||
-          Opcode == ARM::t2STRDi8 ||
-          Opcode == ARM::t2STRD_PRE || Opcode == ARM::t2STRD_POST) {
-        // Load/store dual.
-        return DisassembleThumb2LdStDual(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                         B);
-      }
-      if (Opcode == ARM::t2TBB || Opcode == ARM::t2TBH) {
-        // Table branch.
-        return DisassembleThumb2TB(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      }
-    } else if (slice(op2, 6, 5) == 1) {
-      // Data-processing (shifted register).
-      return DisassembleThumb2DPSoReg(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-    }
-
-    // FIXME: A6.3.18 Coprocessor instructions
-    // But see ThumbDisassembler::getInstruction().
-
-    break;
-  case 2:
-    if (op == 0) {
-      if (slice(op2, 5, 5) == 0)
-        // Data-processing (modified immediate)
-        return DisassembleThumb2DPModImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                         B);
-      if (Thumb2SaturateOpcode(Opcode))
-        return DisassembleThumb2Sat(MI, Opcode, insn, NumOpsAdded, B);
-
-      // Data-processing (plain binary immediate)
-      return DisassembleThumb2DPBinImm(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                       B);
-    }
-    // Branches and miscellaneous control on page A6-20.
-    return DisassembleThumb2BrMiscCtrl(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                       B);
-  case 3:
-    switch (slice(op2, 6, 5)) {
-    case 0:
-      // Load/store instructions...
-      if (slice(op2, 0, 0) == 0) {
-        if (slice(op2, 4, 4) == 0) {
-          // Store single data item on page A6-30
-          return DisassembleThumb2LdSt(false, MI,Opcode,insn,NumOps,NumOpsAdded,
-                                       B);
-        } else {
-          // FIXME: Advanced SIMD element or structure load/store instructions.
-          // But see ThumbDisassembler::getInstruction().
-          ;
-        }
-      } else {
-        // Table A6-9 32-bit Thumb instruction encoding: Load byte|halfword|word
-        return DisassembleThumb2LdSt(true, MI, Opcode, insn, NumOps,
-                                     NumOpsAdded, B);
-      }
-      break;
-    case 1:
-      if (slice(op2, 4, 4) == 0) {
-        // A6.3.12 Data-processing (register)
-        return DisassembleThumb2DPReg(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      } else if (slice(op2, 3, 3) == 0) {
-        // A6.3.16 Multiply, multiply accumulate, and absolute difference
-        return DisassembleThumb2Mul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-      } else {
-        // A6.3.17 Long multiply, long multiply accumulate, and divide
-        return DisassembleThumb2LongMul(MI, Opcode, insn, NumOps, NumOpsAdded,
-                                        B);
-      }
-      break;
-    default:
-      // FIXME: A6.3.18 Coprocessor instructions
-      // But see ThumbDisassembler::getInstruction().
-      ;
-      break;
-    }
-
-    break;
-  default:
-    assert(0 && "Thumb2 encoding error!");
-    break;
-  }
-
-  return false;
-}
-
-static bool DisassembleThumbFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO Builder) {
-
-  uint16_t HalfWord = slice(insn, 31, 16);
-
-  if (HalfWord == 0) {
-    // A6.2 16-bit Thumb instruction encoding
-    // op = bits[15:10]
-    uint16_t op = slice(insn, 15, 10);
-    return DisassembleThumb1(op, MI, Opcode, insn, NumOps, NumOpsAdded,
-                             Builder);
-  }
-
-  unsigned bits15_11 = slice(HalfWord, 15, 11);
-
-  // A6.1 Thumb instruction set encoding
-  if (!(bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F)) {
-    assert("Bits[15:11] first halfword of Thumb2 instruction is out of range");
-    return false;
-  }
-
-  // A6.3 32-bit Thumb instruction encoding
-
-  uint16_t op1 = slice(HalfWord, 12, 11);
-  uint16_t op2 = slice(HalfWord, 10, 4);
-  uint16_t op = slice(insn, 15, 15);
-
-  return DisassembleThumb2(op1, op2, op, MI, Opcode, insn, NumOps, NumOpsAdded,
-                           Builder);
-}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 78d3e477975c8..ccdac3ebeb471 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "asm-printer"
-#include "ARMBaseInfo.h"
 #include "ARMInstPrinter.h"
-#include "ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -25,6 +25,23 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "ARMGenAsmWriter.inc"
 
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+  if (imm == 0)
+    return 32;
+  return imm;
+}
+
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
+                               const MCSubtargetInfo &STI) :
+  MCInstPrinter(MAI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
 StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
 }
@@ -33,11 +50,12 @@ void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   OS << getRegisterName(RegNo);
 }
 
-void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
   unsigned Opcode = MI->getOpcode();
 
   // Check for MOVs and print canonical forms, instead.
-  if (Opcode == ARM::MOVs) {
+  if (Opcode == ARM::MOVsr) {
     // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
@@ -51,20 +69,36 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
     O << '\t' << getRegisterName(Dst.getReg())
       << ", " << getRegisterName(MO1.getReg());
 
-    if (ARM_AM::getSORegShOp(MO3.getImm()) == ARM_AM::rrx)
-      return;
+    O << ", " << getRegisterName(MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
 
-    O << ", ";
+  if (Opcode == ARM::MOVsi) {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, O);
+    printPredicateOperand(MI, 3, O);
 
-    if (MO2.getReg()) {
-      O << getRegisterName(MO2.getReg());
-      assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
-    } else {
-      O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+    O << '\t' << getRegisterName(Dst.getReg())
+      << ", " << getRegisterName(MO1.getReg());
+
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
     }
+
+    O << ", #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+    printAnnotation(O, Annot);
     return;
   }
 
+
   // A8.6.123 PUSH
   if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
       MI->getOperand(0).getReg() == ARM::SP) {
@@ -74,6 +108,15 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
       O << ".w";
     O << '\t';
     printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+      MI->getOperand(3).getImm() == -4) {
+    O << '\t' << "push";
+    printPredicateOperand(MI, 4, O);
+    O << "\t{" << getRegisterName(MI->getOperand(1).getReg()) << "}";
+    printAnnotation(O, Annot);
     return;
   }
 
@@ -86,8 +129,18 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
       O << ".w";
     O << '\t';
     printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
     return;
   }
+  if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+      MI->getOperand(4).getImm() == 4) {
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 5, O);
+    O << "\t{" << getRegisterName(MI->getOperand(0).getReg()) << "}";
+    printAnnotation(O, Annot);
+    return;
+  }
+
 
   // A8.6.355 VPUSH
   if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
@@ -96,6 +149,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
     printPredicateOperand(MI, 2, O);
     O << '\t';
     printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
     return;
   }
 
@@ -106,10 +160,40 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
     printPredicateOperand(MI, 2, O);
     O << '\t';
     printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (Opcode == ARM::tLDMIA) {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, O);
+    O << '\t' << getRegisterName(BaseReg);
+    if (Writeback) O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Thumb1 NOP
+  if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 &&
+      MI->getOperand(1).getReg() == ARM::R8) {
+    O << "\tnop";
+    printPredicateOperand(MI, 2, O);
+    printAnnotation(O, Annot);
     return;
   }
 
   printInstruction(MI, O);
+  printAnnotation(O, Annot);
 }
 
 void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -122,16 +206,38 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << '#' << Op.getImm();
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << "0x";
+      O.write_hex(Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
   }
 }
 
+void ARMInstPrinter::printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr())
+    O << *MO1.getExpr();
+  else if (MO1.isImm())
+    O << "[pc, #" << MO1.getImm() << "]";
+  else
+    llvm_unreachable("Unknown LDR label operand?");
+}
+
 // so_reg is a 4-operand unit corresponding to register forms of the A5.1
 // "Addressing Mode 1 - Data-processing operands" forms.  This includes:
 //    REG 0   0           - e.g. R5
 //    REG REG 0,SH_OPC    - e.g. R5, ROR R3
 //    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
-void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum,
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
                                        raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
@@ -144,14 +250,27 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum,
   O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
   if (ShOpc == ARM_AM::rrx)
     return;
-  if (MO2.getReg()) {
-    O << ' ' << getRegisterName(MO2.getReg());
-    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
-  } else if (ShOpc != ARM_AM::rrx) {
-    O << " #" << ARM_AM::getSORegOffset(MO3.getImm());
-  }
+  
+  O << ' ' << getRegisterName(MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << getRegisterName(MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+  O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
 }
 
+
 //===--------------------------------------------------------------------===//
 // Addressing Mode #2
 //===--------------------------------------------------------------------===//
@@ -209,6 +328,22 @@ void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op,
     << " #" << ShImm;
 }
 
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << getRegisterName(MO1.getReg()) << ", "
+    << getRegisterName(MO2.getReg()) << "]";
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << getRegisterName(MO1.getReg()) << ", "
+    << getRegisterName(MO2.getReg()) << ", lsl #1]";
+}
+
 void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
                                            raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(Op);
@@ -284,7 +419,7 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
   O << '[' << getRegisterName(MO1.getReg());
 
   if (MO2.getReg()) {
-    O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
       << getRegisterName(MO2.getReg()) << ']';
     return;
   }
@@ -315,8 +450,8 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
 
   if (MO1.getReg()) {
-    O << (char)ARM_AM::getAM3Op(MO2.getImm())
-    << getRegisterName(MO1.getReg());
+    O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
+      << getRegisterName(MO1.getReg());
     return;
   }
 
@@ -326,6 +461,31 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
     << ImmOffs;
 }
 
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff);
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << (MO2.getImm() ? "" : "-") << getRegisterName(MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2);
+}
+
+
 void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
   ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum)
@@ -345,7 +505,9 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
 
   O << "[" << getRegisterName(MO1.getReg());
 
-  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+  unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+  unsigned Op = ARM_AM::getAM5Op(MO2.getImm());
+  if (ImmOffs || Op == ARM_AM::sub) {
     O << ", #"
       << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
       << ImmOffs * 4;
@@ -402,20 +564,31 @@ void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
 void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
                                           raw_ostream &O) {
   unsigned ShiftOp = MI->getOperand(OpNum).getImm();
-  ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
-  switch (Opc) {
-  case ARM_AM::no_shift:
+  bool isASR = (ShiftOp & (1 << 5)) != 0;
+  unsigned Amt = ShiftOp & 0x1f;
+  if (isASR)
+    O << ", asr #" << (Amt == 0 ? 32 : Amt);
+  else if (Amt)
+    O << ", lsl #" << Amt;
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
     return;
-  case ARM_AM::lsl:
-    O << ", lsl #";
-    break;
-  case ARM_AM::asr:
-    O << ", asr #";
-    break;
-  default:
-    assert(0 && "unexpected shift opcode for shift immediate operand");
-  }
-  O << ARM_AM::getSORegOffset(ShiftOp);
+  assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+  O << ", lsl #" << Imm;
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  // A shift amount of 32 is encoded as 0.
+  if (Imm == 0)
+    Imm = 32;
+  assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr #" << Imm;
 }
 
 void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
@@ -450,6 +623,9 @@ void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
   for (int i=2; i >= 0; --i)
     if (IFlags & (1 << i))
       O << ARM_PROC::IFlagsToString(1 << i);
+
+  if (IFlags == 0)
+    O << "none";
 }
 
 void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
@@ -458,10 +634,43 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
   unsigned SpecRegRBit = Op.getImm() >> 4;
   unsigned Mask = Op.getImm() & 0xf;
 
+  if (getAvailableFeatures() & ARM::FeatureMClass) {
+    switch (Op.getImm()) {
+    default: assert(0 && "Unexpected mask value!");
+    case 0: O << "apsr"; return;
+    case 1: O << "iapsr"; return;
+    case 2: O << "eapsr"; return;
+    case 3: O << "xpsr"; return;
+    case 5: O << "ipsr"; return;
+    case 6: O << "epsr"; return;
+    case 7: O << "iepsr"; return;
+    case 8: O << "msp"; return;
+    case 9: O << "psp"; return;
+    case 16: O << "primask"; return;
+    case 17: O << "basepri"; return;
+    case 18: O << "basepri_max"; return;
+    case 19: O << "faultmask"; return;
+    case 20: O << "control"; return;
+    }
+  }
+
+  // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively.
+  if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+    O << "APSR_";
+    switch (Mask) {
+    default: assert(0);
+    case 4:  O << "g"; return;
+    case 8:  O << "nzcvq"; return;
+    case 12: O << "nzcvqg"; return;
+    }
+    llvm_unreachable("Unexpected mask value!");
+  }
+
   if (SpecRegRBit)
-    O << "spsr";
+    O << "SPSR";
   else
-    O << "cpsr";
+    O << "CPSR";
 
   if (Mask) {
     O << '_';
@@ -501,15 +710,20 @@ void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
 }
 
 void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
-                                          raw_ostream &O) {
+                                     raw_ostream &O) {
   O << "p" << MI->getOperand(OpNum).getImm();
 }
 
 void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
-                                          raw_ostream &O) {
+                                     raw_ostream &O) {
   O << "c" << MI->getOperand(OpNum).getImm();
 }
 
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
 void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O) {
   llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
@@ -517,7 +731,13 @@ void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
 
 void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
                                             raw_ostream &O) {
-  O << "#" <<  MI->getOperand(OpNum).getImm() * 4;
+  O << "#" << MI->getOperand(OpNum).getImm() * 4;
+}
+
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << "#" << (Imm == 0 ? 32 : Imm);
 }
 
 void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
@@ -610,7 +830,7 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
   ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
   O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
   if (ShOpc != ARM_AM::rrx)
-    O << " #" << ARM_AM::getSORegOffset(MO2.getImm());
+    O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
 }
 
 void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
@@ -647,7 +867,9 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
 
   int32_t OffImm = (int32_t)MO2.getImm();
   // Don't print +0.
-  if (OffImm < 0)
+  if (OffImm == INT32_MIN)
+    O << ", #-0";
+  else if (OffImm < 0)
     O << ", #-" << -OffImm;
   else if (OffImm > 0)
     O << ", #" << OffImm;
@@ -671,6 +893,18 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
   O << "]";
 }
 
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << "[" << getRegisterName(MO1.getReg());
+  if (MO2.getImm())
+    O << ", #" << MO2.getImm() * 4;
+  O << "]";
+}
+
 void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
                                                       unsigned OpNum,
                                                       raw_ostream &O) {
@@ -678,9 +912,9 @@ void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
   int32_t OffImm = (int32_t)MO1.getImm();
   // Don't print +0.
   if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
+    O << ", #-" << -OffImm;
+  else
+    O << ", #" << OffImm;
 }
 
 void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
@@ -689,10 +923,13 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
   const MCOperand &MO1 = MI->getOperand(OpNum);
   int32_t OffImm = (int32_t)MO1.getImm() / 4;
   // Don't print +0.
-  if (OffImm < 0)
-    O << "#-" << -OffImm * 4;
-  else if (OffImm > 0)
-    O << "#" << OffImm * 4;
+  if (OffImm != 0) {
+    O << ", ";
+    if (OffImm < 0)
+      O << "#-" << -OffImm * 4;
+    else if (OffImm > 0)
+      O << "#" << OffImm * 4;
+  }
 }
 
 void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
@@ -715,39 +952,10 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
   O << "]";
 }
 
-void ARMInstPrinter::printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  O << '#';
-  if (MO.isFPImm()) {
-    O << (float)MO.getFPImm();
-  } else {
-    union {
-      uint32_t I;
-      float F;
-    } FPUnion;
-
-    FPUnion.I = MO.getImm();
-    O << FPUnion.F;
-  }
-}
-
-void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &O) {
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
-  O << '#';
-  if (MO.isFPImm()) {
-    O << MO.getFPImm();
-  } else {
-    // We expect the binary encoding of a floating point number here.
-    union {
-      uint64_t I;
-      double D;
-    } FPUnion;
-
-    FPUnion.I = MO.getImm();
-    O << FPUnion.D;
-  }
+  O << '#' << ARM_AM::getFPImmFloat(MO.getImm());
 }
 
 void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
@@ -757,3 +965,28 @@ void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
   uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
   O << "#0x" << utohexstr(Val);
 }
+
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << "#" << Imm + 1;
+}
+
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  O << ", ror #";
+  switch (Imm) {
+  default: assert (0 && "illegal ror immediate!");
+  case 1: O << "8"; break;
+  case 2: O << "16"; break;
+  case 3: O << "24"; break;
+  }
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index d5f238bb8a61b..5c2173fcde626 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -15,6 +15,7 @@
 #define ARMINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 
 namespace llvm {
 
@@ -22,10 +23,9 @@ class MCOperand;
 
 class ARMInstPrinter : public MCInstPrinter {
 public:
-  ARMInstPrinter(const MCAsmInfo &MAI)
-    : MCInstPrinter(MAI) {}
+    ARMInstPrinter(const MCAsmInfo &MAI, const MCSubtargetInfo &STI);
 
-  virtual void printInst(const MCInst *MI, raw_ostream &O);
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
@@ -38,8 +38,11 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
-  void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSORegRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSORegImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
+  void printAddrModeTBB(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrModeTBH(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
@@ -48,11 +51,15 @@ public:
                                    raw_ostream &O);
 
   void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printAM3PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
-                                  raw_ostream &O);
   void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
+  void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,raw_ostream &O);
+  void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+  void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
 
   void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -65,8 +72,11 @@ public:
                                       raw_ostream &O);
   void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
   void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
@@ -88,6 +98,8 @@ public:
                                   raw_ostream &O);
   void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
                                     raw_ostream &O);
+  void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
   void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
                                         raw_ostream &O);
   void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
@@ -108,11 +120,15 @@ public:
   void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
   void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/ARM/InstPrinter/CMakeLists.txt b/lib/Target/ARM/InstPrinter/CMakeLists.txt
index 18645c0864a32..fa0b4957ccdee 100644
--- a/lib/Target/ARM/InstPrinter/CMakeLists.txt
+++ b/lib/Target/ARM/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMARMAsmPrinter
   ARMInstPrinter.cpp
   )
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
+
+add_dependencies(LLVMARMAsmPrinter ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
new file mode 100644
index 0000000000000..9982fa68a5787
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -0,0 +1,667 @@
+//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+
+  enum AddrOpc {
+    sub = 0,
+    add
+  };
+
+  static inline const char *getAddrOpcStr(AddrOpc Op) {
+    return Op == sub ? "-" : "";
+  }
+
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: assert(0 && "Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+
+  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+    switch (Op) {
+    default: assert(0 && "Unknown shift opc!");
+    case ARM_AM::asr: return 2;
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::ror: return 3;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift amount (or reg0 if not present or imm).  The
+  // third operand encodes the shift opcode and the imm if a reg isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = CountTrailingZeros_32(Imm);
+
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should ignore the low 6 bits, then
+    // retry the hunt.
+    if (Imm & 63U) {
+      unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U);
+      unsigned RotAmt2 = TZ2 & ~1;
+      if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
+        return (32-RotAmt2)&31;  // HW rotates right, not left.
+    }
+
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into an shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
+
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first hunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
+
+  /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting a 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+    // 16-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~65535U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImm16ShiftedVal - Return true if the specified value can be
+  /// obtained by left shifting a 16-bit immediate.
+  static inline bool isThumbImm16ShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~65535U << getThumbImm16ValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shiftd value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
+
+
+  /// getT2SOImmValSplat - Return the 12-bit encoded representation
+  /// if the specified value can be obtained by splatting the low 8 bits
+  /// into every other byte or every byte of a 32-bit value. i.e.,
+  ///     00000000 00000000 00000000 abcdefgh    control = 0
+  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
+  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
+  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
+  /// Return -1 if none of the above apply.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValSplatVal(unsigned V) {
+    unsigned u, Vs, Imm;
+    // control = 0
+    if ((V & 0xffffff00) == 0)
+      return V;
+
+    // If the value is zeroes in the first byte, just shift those off
+    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
+    // Any passing value only has 8 bits of payload, splatted across the word
+    Imm = Vs & 0xff;
+    // Likewise, any passing values have the payload splatted into the 3rd byte
+    u = Imm | (Imm << 16);
+
+    // control = 1 or 2
+    if (Vs == u)
+      return (((Vs == V) ? 1 : 2) << 8) | Imm;
+
+    // control = 3
+    if (Vs == (u | (u << 8)))
+      return (3 << 8) | Imm;
+
+    return -1;
+  }
+
+  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
+  /// specified value is a rotated 8-bit value. Return -1 if no rotation
+  /// encoding is possible.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValRotateVal(unsigned V) {
+    unsigned RotAmt = CountLeadingZeros_32(V);
+    if (RotAmt >= 24)
+      return -1;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    if ((rotr32(0xff000000U, RotAmt) & V) == V)
+      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
+
+    return -1;
+  }
+
+  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+  /// encoding for it.  If not, return -1.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmVal(unsigned Arg) {
+    // If 'Arg' is an 8-bit splat, then get the encoded value.
+    int Splat = getT2SOImmValSplatVal(Arg);
+    if (Splat != -1)
+      return Splat;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    int Rot = getT2SOImmValRotateVal(Arg);
+    if (Rot != -1)
+      return Rot;
+
+    return -1;
+  }
+
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = CountTrailingZeros_32(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
+  // fourth operand 16-17 encodes the index mode.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  //
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
+                                   unsigned IdxMode = 0) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)((AM2Opc >> 13) & 7);
+  }
+  static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
+    return (AM2Opc >> 16);
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
+  // index mode.
+
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
+                                   unsigned IdxMode = 0) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset | (IdxMode << 9);
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
+  static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
+    return (AM3Opc >> 9);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  // For VFP instructions, only the IA and DB modes are valid.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
+    return (int)SubMode;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation in bit 8 and the immediate in bits 0-7.
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #6
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for NEON load / store instructions.
+  //
+  // addrmode6 := reg with optional alignment
+  //
+  // This is stored in two operands [regaddr, align].  The first is the
+  // address register.  The second operand is the value of the alignment
+  // specifier in bytes or zero if no explicit alignment.
+  // Valid alignments depend on the specific instruction.
+
+  //===--------------------------------------------------------------------===//
+  // NEON Modified Immediates
+  //===--------------------------------------------------------------------===//
+  //
+  // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+  // vector operand, where a small immediate encoded in the instruction
+  // specifies a full NEON vector value.  These modified immediates are
+  // represented here as encoded integers.  The low 8 bits hold the immediate
+  // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
+  // the "Cmode" field of the instruction.  The interfaces below treat the
+  // Op and Cmode values as a single 5-bit value.
+
+  static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+    return (OpCmode << 8) | Val;
+  }
+  static inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+    return (ModImm >> 8) & 0x1f;
+  }
+  static inline unsigned getNEONModImmVal(unsigned ModImm) {
+    return ModImm & 0xff;
+  }
+
+  /// decodeNEONModImm - Decode a NEON modified immediate value into the
+  /// element value and the element size in bits.  (If the element size is
+  /// smaller than the vector, it is splatted into all the elements.)
+  static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
+    unsigned OpCmode = getNEONModImmOpCmode(ModImm);
+    unsigned Imm8 = getNEONModImmVal(ModImm);
+    uint64_t Val = 0;
+
+    if (OpCmode == 0xe) {
+      // 8-bit vector elements
+      Val = Imm8;
+      EltBits = 8;
+    } else if ((OpCmode & 0xc) == 0x8) {
+      // 16-bit vector elements
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 16;
+    } else if ((OpCmode & 0x8) == 0) {
+      // 32-bit vector elements, zero with one byte set
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 32;
+    } else if ((OpCmode & 0xe) == 0xc) {
+      // 32-bit vector elements, one byte with low bits set
+      unsigned ByteNum = 1 + (OpCmode & 0x1);
+      Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
+      EltBits = 32;
+    } else if (OpCmode == 0x1e) {
+      // 64-bit vector elements
+      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+        if ((ModImm >> ByteNum) & 1)
+          Val |= (uint64_t)0xff << (8 * ByteNum);
+      }
+      EltBits = 64;
+    } else {
+      assert(false && "Unsupported NEON immediate");
+    }
+    return Val;
+  }
+
+  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
+
+  //===--------------------------------------------------------------------===//
+  // Floating-point Immediates
+  //
+  static inline float getFPImmFloat(unsigned Imm) {
+    // We expect an 8-bit binary encoding of a floating-point number here.
+    union {
+      uint32_t I;
+      float F;
+    } FPUnion;
+
+    uint8_t Sign = (Imm >> 7) & 0x1;
+    uint8_t Exp = (Imm >> 4) & 0x7;
+    uint8_t Mantissa = Imm & 0xf;
+
+    //   8-bit FP    iEEEE Float Encoding
+    //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+    //
+    // where B = NOT(b);
+
+    FPUnion.I = 0;
+    FPUnion.I |= Sign << 31;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+    FPUnion.I |= (Exp & 0x3) << 23;
+    FPUnion.I |= Mantissa << 19;
+    return FPUnion.F;
+  }
+
+  /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP32Imm(const APInt &Imm) {
+    uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+    int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+    int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0x7ffff)
+      return -1;
+    Mantissa >>= 19;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP32Imm(const APFloat &FPImm) {
+    return getFP32Imm(FPImm.bitcastToAPInt());
+  }
+
+  /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP64Imm(const APInt &Imm) {
+    uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+    int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+    uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0xffffffffffffULL)
+      return -1;
+    Mantissa >>= 48;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP64Imm(const APFloat &FPImm) {
+    return getFP64Imm(FPImm.bitcastToAPInt());
+  }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
new file mode 100644
index 0000000000000..c31c5e6b84520
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -0,0 +1,531 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  ARMELFObjectWriter(Triple::OSType OSType)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSType, ELF::EM_ARM,
+                              /*HasRelocationAddend*/ false) {}
+};
+
+class ARMAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo* STI;
+  bool isThumbMode;  // Currently emitting Thumb code.
+public:
+  ARMAsmBackend(const Target &T, const StringRef TT)
+    : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+      isThumbMode(TT.startswith("thumb")) {}
+
+  ~ARMAsmBackend() {
+    delete STI;
+  }
+
+  unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
+
+  bool hasNOP() const {
+    return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name                      Offset (bits) Size (bits)     Flags
+{ "fixup_arm_ldst_pcrel_12", 1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10",      1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12",  1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp",      0,             8,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16",     0,            20,  0 },
+{ "fixup_arm_movw_lo16",     0,            20,  0 },
+{ "fixup_t2_movt_hi16",      0,            20,  0 },
+{ "fixup_t2_movw_lo16",      0,            20,  0 },
+{ "fixup_arm_movt_hi16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_movw_lo16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movt_hi16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movw_lo16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  bool MayNeedRelaxation(const MCInst &Inst) const;
+
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  void HandleAssemblerFlag(MCAssemblerFlag Flag) {
+    switch (Flag) {
+    default: break;
+    case MCAF_Code16:
+      setIsThumb(true);
+      break;
+    case MCAF_Code32:
+      setIsThumb(false);
+      break;
+    }
+  }
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+};
+} // end anonymous namespace
+
+bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+  // FIXME: Thumb targets, different move constant targets..
+  return false;
+}
+
+void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented");
+  return;
+}
+
+bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+  const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+  const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+  const uint32_t ARMv4_NopEncoding = 0xe1a0000; // using MOV r0,r0
+  const uint32_t ARMv6T2_NopEncoding = 0xe3207800; // NOP
+  if (isThumb()) {
+    const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding
+                                          : Thumb1_16bitNopEncoding;
+    uint64_t NumNops = Count / 2;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write16(nopEncoding);
+    if (Count & 1)
+      OW->Write8(0);
+    return true;
+  }
+  // ARM mode
+  const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding
+                                        : ARMv4_NopEncoding;
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(nopEncoding);
+  // FIXME: should this function return false when unable to write exactly
+  // 'Count' bytes with NOP encodings?
+  switch (Count % 4) {
+  default: break; // No leftover bytes to write
+  case 1: OW->Write8(0); break;
+  case 2: OW->Write16(0); break;
+  case 3: OW->Write16(0); OW->Write8(0xa0); break;
+  }
+
+  return true;
+}
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case ARM::fixup_arm_movt_hi16:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_arm_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned Lo12 = Value & 0x0FFF;
+    // inst{19-16} = Hi4;
+    // inst{11-0} = Lo12;
+    Value = (Hi4 << 16) | (Lo12);
+    return Value;
+  }
+  case ARM::fixup_t2_movt_hi16:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:  //FIXME: Shouldn't this be shifted like
+                                       // the other hi16 fixup?
+  case ARM::fixup_t2_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned i = (Value & 0x800) >> 11;
+    unsigned Mid3 = (Value & 0x700) >> 8;
+    unsigned Lo8 = Value & 0x0FF;
+    // inst{19-16} = Hi4;
+    // inst{26} = i;
+    // inst{14-12} = Mid3;
+    // inst{7-0} = Lo8;
+    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+    uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+    swapped |= (Value & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_ldst_pcrel_12:
+    // ARM PC-relative values are offset by 8.
+    Value -= 4;
+    // FALLTHROUGH
+  case ARM::fixup_t2_ldst_pcrel_12: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value -= 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    assert ((Value < 4096) && "Out of range pc-relative fixup value!");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12) {
+      uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return ((Value - 4) >> 2) & 0xff;
+  case ARM::fixup_arm_adr_pcrel_12: {
+    // ARM PC-relative values are offset by 8.
+    Value -= 8;
+    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 2; // 0b0010
+    }
+    assert(ARM_AM::getSOImmVal(Value) != -1 &&
+           "Out of range pc-relative fixup value!");
+    // Encode the immediate and shift the opcode into place.
+    return ARM_AM::getSOImmVal(Value) | (opc << 21);
+  }
+
+  case ARM::fixup_t2_adr_pcrel_12: {
+    Value -= 4;
+    unsigned opc = 0;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 5;
+    }
+
+    uint32_t out = (opc << 21);
+    out |= (Value & 0x800) << 15;
+    out |= (Value & 0x700) << 4;
+    out |= (Value & 0x0FF);
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    // These values don't encode the low two bits since they're always zero.
+    // Offset by 8 just as above.
+    return 0xffffff & ((Value - 8) >> 2);
+  case ARM::fixup_t2_uncondbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint32_t out = 0;
+    bool I =  Value & 0x800000;
+    bool J1 = Value & 0x400000;
+    bool J2 = Value & 0x200000;
+    J1 ^= I;
+    J2 ^= I;
+
+    out |= I  << 26; // S bit
+    out |= !J1 << 13; // J1 bit
+    out |= !J2 << 11; // J2 bit
+    out |= (Value & 0x1FF800)  << 5; // imm6 field
+    out |= (Value & 0x0007FF);        // imm11 field
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_t2_condbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint64_t out = 0;
+    out |= (Value & 0x80000) << 7; // S bit
+    out |= (Value & 0x40000) >> 7; // J2 bit
+    out |= (Value & 0x20000) >> 4; // J1 bit
+    out |= (Value & 0x1F800) << 5; // imm6 field
+    out |= (Value & 0x007FF);      // imm11 field
+
+    uint32_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_thumb_bl: {
+    // The value doesn't encode the low bit (always zero) and is offset by
+    // four. The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit
+    //
+    //   BL:  xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0;
+    uint32_t Binary = 0;
+    Value = 0x3fffff & ((Value - 4) >> 1);
+    Binary  = (Value & 0x7ff) << 16;    // Low imm11 value.
+    Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value.
+    Binary |= isNeg << 10;              // Sign bit.
+    return Binary;
+  }
+  case ARM::fixup_arm_thumb_blx: {
+    // The value doesn't encode the low two bits (always zero) and is offset by
+    // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit
+    // positions in the destination opcode. x = unchanged, I = immediate value
+    // bit, S = sign extension bit, 0 = zero.
+    //
+    //   BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0;
+    uint32_t Binary = 0;
+    Value = 0xfffff & ((Value - 2) >> 2);
+    Binary  = (Value & 0x3ff) << 17;    // Low imm10L value.
+    Binary |= (Value & 0xffc00) >> 10;  // High imm10H value.
+    Binary |= isNeg << 10;              // Sign bit.
+    return Binary;
+  }
+  case ARM::fixup_arm_thumb_cp:
+    // Offset by 4, and don't encode the low two bits. Two bytes of that
+    // 'off by 4' is implicitly handled by the half-word ordering of the
+    // Thumb encoding, so we only need to adjust by 2 here.
+    return ((Value - 2) >> 2) & 0xff;
+  case ARM::fixup_arm_thumb_cb: {
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    uint32_t Binary = (Value - 4) >> 1;
+    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
+  }
+  case ARM::fixup_arm_thumb_br:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0x7ff;
+  case ARM::fixup_arm_thumb_bcc:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0xff;
+  case ARM::fixup_arm_pcrel_10:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    // Fall through.
+  case ARM::fixup_t2_pcrel_10: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low two bits since they're always zero.
+    Value >>= 2;
+    assert ((Value < 256) && "Out of range pc-relative fixup value!");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_pcrel_10) {
+      uint32_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  }
+}
+
+namespace {
+
+// FIXME: This should be in a separate file.
+// ELF is an ELF of course...
+class ELFARMAsmBackend : public ARMAsmBackend {
+public:
+  Triple::OSType OSType;
+  ELFARMAsmBackend(const Target &T, const StringRef TT,
+                   Triple::OSType _OSType)
+    : ARMAsmBackend(T, TT), OSType(_OSType) { }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(new ARMELFObjectWriter(OSType), OS,
+                              /*IsLittleEndian*/ true);
+  }
+};
+
+// FIXME: Raise this to share code between Darwin and ELF.
+void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value) const {
+  unsigned NumBytes = 4;        // FIXME: 2 for Thumb
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value. The Value has been "split up" into the appropriate
+  // bitfields above.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+// FIXME: This should be in a separate file.
+class DarwinARMAsmBackend : public ARMAsmBackend {
+public:
+  const object::mach::CPUSubtypeARM Subtype;
+  DarwinARMAsmBackend(const Target &T, const StringRef TT,
+                      object::mach::CPUSubtypeARM st)
+    : ARMAsmBackend(T, TT), Subtype(st) { }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
+                                     object::mach::CTM_ARM,
+                                     Subtype);
+  }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    return false;
+  }
+};
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return 1;
+
+  case FK_Data_2:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    return 2;
+
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    return 3;
+
+  case FK_Data_4:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_arm_movw_lo16_pcrel:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    return 4;
+  }
+}
+
+void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin()) {
+    if (TheTriple.getArchName() == "armv4t" ||
+        TheTriple.getArchName() == "thumbv4t")
+      return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V4T);
+    else if (TheTriple.getArchName() == "armv5e" ||
+        TheTriple.getArchName() == "thumbv5e")
+      return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V5TEJ);
+    else if (TheTriple.getArchName() == "armv6" ||
+        TheTriple.getArchName() == "thumbv6")
+      return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6);
+    return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7);
+  }
+
+  if (TheTriple.isOSWindows())
+    assert(0 && "Windows not supported on ARM");
+
+  return new ELFARMAsmBackend(T, TT, Triple(TT).getOS());
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
new file mode 100644
index 0000000000000..ec4b6ffcfe834
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -0,0 +1,449 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINFO_H
+#define ARMBASEINFO_H
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
+    EQ,            // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Carry set                  >, ==, or unordered
+    LO,            // Carry clear                Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Not unordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Greater than               Greater than
+    LE,            // Less than or equal         <, ==, or unordered
+    AL             // Always (unconditional)     Always (unconditional)
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+} // namespace ARMCC
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code");
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+}
+
+namespace ARM_PROC {
+  enum IMod {
+    IE = 2,
+    ID = 3
+  };
+
+  enum IFlags {
+    F = 1,
+    I = 2,
+    A = 4
+  };
+
+  inline static const char *IFlagsToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown iflags operand");
+    case F: return "f";
+    case I: return "i";
+    case A: return "a";
+    }
+  }
+
+  inline static const char *IModToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown imod operand");
+    case IE: return "ie";
+    case ID: return "id";
+    }
+  }
+}
+
+namespace ARM_MB {
+  // The Memory Barrier Option constants map directly to the 4-bit encoding of
+  // the option field for memory barrier operations.
+  enum MemBOpt {
+    SY    = 15,
+    ST    = 14,
+    ISH   = 11,
+    ISHST = 10,
+    NSH   = 7,
+    NSHST = 6,
+    OSH   = 3,
+    OSHST = 2
+  };
+
+  inline static const char *MemBOptToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown memory operation");
+    case SY:    return "sy";
+    case ST:    return "st";
+    case ISH:   return "ish";
+    case ISHST: return "ishst";
+    case NSH:   return "nsh";
+    case NSHST: return "nshst";
+    case OSH:   return "osh";
+    case OSHST: return "oshst";
+    }
+  }
+} // namespace ARM_MB
+
+/// getARMRegisterNumbering - Given the enum value for some register, e.g.
+/// ARM::LR, return the number that it corresponds to (e.g. 14).
+inline static unsigned getARMRegisterNumbering(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  default:
+    llvm_unreachable("Unknown ARM register!");
+  case R0:  case S0:  case D0:  case Q0:  return 0;
+  case R1:  case S1:  case D1:  case Q1:  return 1;
+  case R2:  case S2:  case D2:  case Q2:  return 2;
+  case R3:  case S3:  case D3:  case Q3:  return 3;
+  case R4:  case S4:  case D4:  case Q4:  return 4;
+  case R5:  case S5:  case D5:  case Q5:  return 5;
+  case R6:  case S6:  case D6:  case Q6:  return 6;
+  case R7:  case S7:  case D7:  case Q7:  return 7;
+  case R8:  case S8:  case D8:  case Q8:  return 8;
+  case R9:  case S9:  case D9:  case Q9:  return 9;
+  case R10: case S10: case D10: case Q10: return 10;
+  case R11: case S11: case D11: case Q11: return 11;
+  case R12: case S12: case D12: case Q12: return 12;
+  case SP:  case S13: case D13: case Q13: return 13;
+  case LR:  case S14: case D14: case Q14: return 14;
+  case PC:  case S15: case D15: case Q15: return 15;
+
+  case S16: case D16: return 16;
+  case S17: case D17: return 17;
+  case S18: case D18: return 18;
+  case S19: case D19: return 19;
+  case S20: case D20: return 20;
+  case S21: case D21: return 21;
+  case S22: case D22: return 22;
+  case S23: case D23: return 23;
+  case S24: case D24: return 24;
+  case S25: case D25: return 25;
+  case S26: case D26: return 26;
+  case S27: case D27: return 27;
+  case S28: case D28: return 28;
+  case S29: case D29: return 29;
+  case S30: case D30: return 30;
+  case S31: case D31: return 31;
+  }
+}
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+
+  /// ARM Index Modes
+  enum IndexMode {
+    IndexModeNone  = 0,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+    IndexModeUpd   = 3
+  };
+
+  /// ARM Addressing Modes
+  enum AddrMode {
+    AddrModeNone    = 0,
+    AddrMode1       = 1,
+    AddrMode2       = 2,
+    AddrMode3       = 3,
+    AddrMode4       = 4,
+    AddrMode5       = 5,
+    AddrMode6       = 6,
+    AddrModeT1_1    = 7,
+    AddrModeT1_2    = 8,
+    AddrModeT1_4    = 9,
+    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
+    AddrModeT2_i12  = 11,
+    AddrModeT2_i8   = 12,
+    AddrModeT2_so   = 13,
+    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
+    AddrModeT2_i8s4 = 15, // i8 * 4
+    AddrMode_i12    = 16
+  };
+
+  inline static const char *AddrModeToString(AddrMode addrmode) {
+    switch (addrmode) {
+    default: llvm_unreachable("Unknown memory operation");
+    case AddrModeNone:    return "AddrModeNone";
+    case AddrMode1:       return "AddrMode1";
+    case AddrMode2:       return "AddrMode2";
+    case AddrMode3:       return "AddrMode3";
+    case AddrMode4:       return "AddrMode4";
+    case AddrMode5:       return "AddrMode5";
+    case AddrMode6:       return "AddrMode6";
+    case AddrModeT1_1:    return "AddrModeT1_1";
+    case AddrModeT1_2:    return "AddrModeT1_2";
+    case AddrModeT1_4:    return "AddrModeT1_4";
+    case AddrModeT1_s:    return "AddrModeT1_s";
+    case AddrModeT2_i12:  return "AddrModeT2_i12";
+    case AddrModeT2_i8:   return "AddrModeT2_i8";
+    case AddrModeT2_so:   return "AddrModeT2_so";
+    case AddrModeT2_pc:   return "AddrModeT2_pc";
+    case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
+    case AddrMode_i12:    return "AddrMode_i12";
+    }
+  }
+
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16,
+
+    /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY,
+
+    /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction.
+    MO_HI16_NONLAZY,
+
+    /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY_PIC,
+
+    /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movt instruction.
+    MO_HI16_NONLAZY_PIC,
+
+    /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
+    /// call operand.
+    MO_PLT
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.
+    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+    // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
+    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
+    // The index mode enums are declared in ARMBaseInfo.h
+    IndexModeShift = 5,
+    IndexModeMask  = 3 << IndexModeShift,
+
+    //===------------------------------------------------------------------===//
+    // Instruction encoding formats.
+    //
+    FormShift     = 7,
+    FormMask      = 0x3f << FormShift,
+
+    // Pseudo instructions
+    Pseudo        = 0  << FormShift,
+
+    // Multiply instructions
+    MulFrm        = 1  << FormShift,
+
+    // Branch instructions
+    BrFrm         = 2  << FormShift,
+    BrMiscFrm     = 3  << FormShift,
+
+    // Data Processing instructions
+    DPFrm         = 4  << FormShift,
+    DPSoRegFrm    = 5  << FormShift,
+
+    // Load and Store
+    LdFrm         = 6  << FormShift,
+    StFrm         = 7  << FormShift,
+    LdMiscFrm     = 8  << FormShift,
+    StMiscFrm     = 9  << FormShift,
+    LdStMulFrm    = 10 << FormShift,
+
+    LdStExFrm     = 11 << FormShift,
+
+    // Miscellaneous arithmetic instructions
+    ArithMiscFrm  = 12 << FormShift,
+    SatFrm        = 13 << FormShift,
+
+    // Extend instructions
+    ExtFrm        = 14 << FormShift,
+
+    // VFP formats
+    VFPUnaryFrm   = 15 << FormShift,
+    VFPBinaryFrm  = 16 << FormShift,
+    VFPConv1Frm   = 17 << FormShift,
+    VFPConv2Frm   = 18 << FormShift,
+    VFPConv3Frm   = 19 << FormShift,
+    VFPConv4Frm   = 20 << FormShift,
+    VFPConv5Frm   = 21 << FormShift,
+    VFPLdStFrm    = 22 << FormShift,
+    VFPLdStMulFrm = 23 << FormShift,
+    VFPMiscFrm    = 24 << FormShift,
+
+    // Thumb format
+    ThumbFrm      = 25 << FormShift,
+
+    // Miscelleaneous format
+    MiscFrm       = 26 << FormShift,
+
+    // NEON formats
+    NGetLnFrm     = 27 << FormShift,
+    NSetLnFrm     = 28 << FormShift,
+    NDupFrm       = 29 << FormShift,
+    NLdStFrm      = 30 << FormShift,
+    N1RegModImmFrm= 31 << FormShift,
+    N2RegFrm      = 32 << FormShift,
+    NVCVTFrm      = 33 << FormShift,
+    NVDupLnFrm    = 34 << FormShift,
+    N2RegVShLFrm  = 35 << FormShift,
+    N2RegVShRFrm  = 36 << FormShift,
+    N3RegFrm      = 37 << FormShift,
+    N3RegVShFrm   = 38 << FormShift,
+    NVExtFrm      = 39 << FormShift,
+    NVMulSLFrm    = 40 << FormShift,
+    NVTBLFrm      = 41 << FormShift,
+
+    //===------------------------------------------------------------------===//
+    // Misc flags.
+
+    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+    // it doesn't have a Rn operand.
+    UnaryDP       = 1 << 13,
+
+    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+    // a 16-bit Thumb instruction if certain conditions are met.
+    Xform16Bit    = 1 << 14,
+
+    // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+    // instruction. Used by the parser to determine whether to require the 'S'
+    // suffix on the mnemonic (when not in an IT block) or preclude it (when
+    // in an IT block).
+    ThumbArithFlagSetting = 1 << 18,
+
+    //===------------------------------------------------------------------===//
+    // Code domain.
+    DomainShift   = 15,
+    DomainMask    = 7 << DomainShift,
+    DomainGeneral = 0 << DomainShift,
+    DomainVFP     = 1 << DomainShift,
+    DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,
+
+    //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set field while generating
+    // machine instructions.
+    //
+    // FIXME: This list will need adjusting/fixing as the MC code emitter
+    // takes shape and the ARMCodeEmitter.cpp bits go away.
+    ShiftTypeShift = 4,
+
+    M_BitShift     = 5,
+    ShiftImmShift  = 5,
+    ShiftShift     = 7,
+    N_BitShift     = 7,
+    ImmHiShift     = 8,
+    SoRotImmShift  = 8,
+    RegRsShift     = 8,
+    ExtRotImmShift = 10,
+    RegRdLoShift   = 12,
+    RegRdShift     = 12,
+    RegRdHiShift   = 16,
+    RegRnShift     = 16,
+    S_BitShift     = 20,
+    W_BitShift     = 21,
+    AM3_I_BitShift = 22,
+    D_BitShift     = 22,
+    U_BitShift     = 23,
+    P_BitShift     = 24,
+    I_BitShift     = 25,
+    CondShift      = 28
+  };
+
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
new file mode 100644
index 0000000000000..350c92decdce4
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- ARM/ARMFixupKinds.h - ARM Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM_ARMFIXUPKINDS_H
+#define LLVM_ARM_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+  // addresses
+  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+  // the 16-bit halfwords reordered.
+  fixup_t2_ldst_pcrel_12,
+
+  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where the lower 2 bits are not encoded
+  // (so it's encoded as an 8-bit immediate).
+  fixup_arm_pcrel_10,
+  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_10,
+  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+  // addresses where the lower 2 bits are not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_thumb_adr_pcrel_10,
+  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_arm_adr_pcrel_12,
+  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_t2_adr_pcrel_12,
+  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+  // instructions. 
+  fixup_arm_condbranch,
+  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
+  // branch instructions. (unconditional)
+  fixup_arm_uncondbranch,
+  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+  // uconditional branch instructions.
+  fixup_t2_condbranch,
+  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+  // branch unconditional branch instructions.
+  fixup_t2_uncondbranch,
+
+  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+  fixup_arm_thumb_br,
+
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+  fixup_arm_thumb_bl,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+  fixup_arm_thumb_blx,
+
+  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+  fixup_arm_thumb_cb,
+
+  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+  fixup_arm_thumb_cp,
+
+  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+  fixup_arm_thumb_bcc,
+
+  // The next two are for the movt/movw pair
+  // the 16bit imm field are split into imm{15-12} and imm{11-0}
+  fixup_arm_movt_hi16, // :upper16:
+  fixup_arm_movw_lo16, // :lower16:
+  fixup_t2_movt_hi16, // :upper16:
+  fixup_t2_movw_lo16, // :lower16:
+
+  // It is possible to create an "immediate" that happens to be pcrel.
+  // movw r0, :lower16:Foo-(Bar+8) and movt  r0, :upper16:Foo-(Bar+8)
+  // result in different reloc tags than the above two.
+  // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC
+  fixup_arm_movt_hi16_pcrel, // :upper16:
+  fixup_arm_movw_lo16_pcrel, // :lower16:
+  fixup_t2_movt_hi16_pcrel, // :upper16:
+  fixup_t2_movw_lo16_pcrel, // :lower16:
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 53b4c95d38012..1c109e0152805 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -52,6 +52,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
   AsmTransCBE = arm_asm_table;
   Data64bitsDirective = 0;
   CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+
   SupportsDebugInformation = true;
 
   // Exceptions handling
@@ -64,12 +67,14 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
 
   Data64bitsDirective = 0;
   CommentString = "@";
-
-  HasLEB128 = true;
   PrivateGlobalPrefix = ".L";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+
   WeakRefDirective = "\t.weak\t";
-  HasLCOMMDirective = true;
+  LCOMMDirectiveType = LCOMM::NoAlignment;
 
+  HasLEB128 = true;
   SupportsDebugInformation = true;
 
   // Exceptions handling
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
new file mode 100644
index 0000000000000..865c3e22b8424
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1468 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+  const MCInstrInfo &MCII;
+  const MCSubtargetInfo &STI;
+
+public:
+  ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx)
+    : MCII(mcii), STI(sti) {
+  }
+
+  ~ARMMCCodeEmitter() {}
+
+  bool isThumb() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+  }
+  bool isThumb2() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
+  }
+  bool isTargetDarwin() const {
+    Triple TT(STI.getTargetTriple());
+    Triple::OSType OS = TT.getOS();
+    return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
+  }
+
+  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  unsigned getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+  /// the specified operand. This is used for operands with :lower16: and
+  /// :upper16: prefixes.
+  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+                              unsigned &Reg, unsigned &Imm,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+  /// BL branch target.
+  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+  /// BLX branch target.
+  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+  /// immediate Thumb2 direct branch target.
+  uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+  /// ADR label target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups)const;
+
+  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+  /// operand.
+  uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+  /// operand as needed by load/store instructions.
+  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
+  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::da: return 0;
+    case ARM_AM::ia: return 1;
+    case ARM_AM::db: return 2;
+    case ARM_AM::ib: return 3;
+    }
+  }
+  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+  ///
+  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+    switch (ShOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::no_shift:
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::asr: return 2;
+    case ARM_AM::ror:
+    case ARM_AM::rrx: return 3;
+    }
+    return 0;
+  }
+
+  /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
+  uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+  uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
+    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
+    return Encoded;
+  }
+
+  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getSORegOpValue - Return an encoded so_reg shifted register value.
+  unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                   SmallVectorImpl<MCFixup> &Fixups) const {
+    return 64 - MI.getOperand(Op).getImm();
+  }
+
+  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                      unsigned EncodedValue) const;
+  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                          unsigned EncodedValue) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0x0FFFFFFF;
+    EncodedValue |= 0xE0000000;
+  }
+  return EncodedValue;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = getARMRegisterNumbering(Reg);
+
+    // Q registers are encoded as 2x their register number.
+    switch (Reg) {
+    default:
+      return RegNo;
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * RegNo;
+    }
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+  return 0;
+}
+
+/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+
+  Reg = getARMRegisterNumbering(MO.getReg());
+
+  int32_t SImm = MO1.getImm();
+  bool isAdd = true;
+
+  // Special value for #-0
+  if (SImm == INT32_MIN) {
+    SImm = 0;
+    isAdd = false;
+  }
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (SImm < 0) {
+    SImm = -SImm;
+    isAdd = false;
+  }
+
+  Imm = SImm;
+  return isAdd;
+}
+
+/// getBranchTargetOpValue - Helper function to get the branch target operand,
+/// which is either an immediate or requires a fixup.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups) {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are
+// determined by negating them and XOR'ing them with bit 23.
+static int32_t encodeThumbBLOffset(int32_t offset) {
+  offset >>= 1;
+  uint32_t S  = (offset & 0x800000) >> 23;
+  uint32_t J1 = (offset & 0x400000) >> 22;
+  uint32_t J2 = (offset & 0x200000) >> 21;
+  J1 = (~J1 & 0x1);
+  J2 = (~J2 & 0x1);
+  J1 ^= S;
+  J2 ^= S;
+
+  offset &= ~0x600000;
+  offset |= J1 << 22;
+  offset |= J2 << 21;
+
+  return offset;
+}
+
+/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+                                    Fixups);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+/// BLX branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+                                    Fixups);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+                                    Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+                                    Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// Return true if this branch has a non-always predication
+static bool HasConditionalBranch(const MCInst &MI) {
+  int NumOp = MI.getNumOperands();
+  if (NumOp >= 2) {
+    for (int i = 0; i < NumOp-1; ++i) {
+      const MCOperand &MCOp1 = MI.getOperand(i);
+      const MCOperand &MCOp2 = MI.getOperand(i + 1);
+      if (MCOp1.isImm() && MCOp2.isReg() &&
+          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
+        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (isThumb2())
+    return
+      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups);
+  return getARMBranchTargetOpValue(MI, OpIdx, Fixups);
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx,
+                                      ARM::fixup_arm_condbranch, Fixups);
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_uncondbranch, Fixups);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx,
+                                      ARM::fixup_arm_condbranch, Fixups);
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_uncondbranch, Fixups);
+  }
+
+  return MO.getImm() >> 1;
+}
+
+/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+/// immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Val =
+    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  bool I  = (Val & 0x800000);
+  bool J1 = (Val & 0x400000);
+  bool J2 = (Val & 0x200000);
+  if (I ^ J1)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (I ^ J2)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+                                    Fixups);
+  int32_t offset = MO.getImm();
+  uint32_t Val = 0x2000;
+  if (offset < 0) {
+    Val = 0x1000;
+    offset *= -1;
+  }
+  Val |= offset;
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                    Fixups);
+  int32_t Val = MO.getImm();
+  if (Val < 0) {
+    Val *= -1;
+    Val |= 0x1000;
+  }
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                    Fixups);
+  return MO.getImm();
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &) const {
+  // [Rn, Rm]
+  //   {5-3} = Rm
+  //   {2-0} = Rn
+  const MCOperand &MO1 = MI.getOperand(OpIdx);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = getARMRegisterNumbering(MO1.getReg());
+  unsigned Rm = getARMRegisterNumbering(MO2.getReg());
+  return (Rm << 3) | Rn;
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-13} = reg
+  // {12}    = (U)nsigned (add == '1', sub == '0')
+  // {11-0}  = imm12
+  unsigned Reg, Imm12;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm12 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+
+      MCFixupKind Kind;
+      if (isThumb2())
+        Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+      else
+        Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+      Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+      ++MCNumCPRelocations;
+    } else {
+      Reg = ARM::PC;
+      int32_t Offset = MO.getImm();
+      if (Offset < 0) {
+        Offset *= -1;
+        isAdd = false;
+      }
+      Imm12 = Offset;
+    }
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
+
+  uint32_t Binary = Imm12 & 0xfff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 12);
+  Binary |= (Reg << 13);
+  return Binary;
+}
+
+/// getT2Imm8s4OpValue - Return encoding info for
+/// '+/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  int32_t Imm8 = MI.getOperand(OpIdx).getImm();
+  bool isAdd = Imm8 >= 0;
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (Imm8 < 0)
+    Imm8 = -Imm8;
+
+  // Scaled by 4.
+  Imm8 /= 4;
+
+  uint32_t Binary = Imm8 & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  return Binary;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+  uint32_t Binary = (Imm8 >> 2) & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
+/// 'reg + imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {11-8} = reg
+  // {7-0}  = imm8
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Reg = getARMRegisterNumbering(MO.getReg());
+  unsigned Imm8 = MO1.getImm();
+  return (Reg << 8) | Imm8;
+}
+
+// FIXME: This routine assumes that a binary
+// expression will always result in a PCRel expression
+// In reality, its only true if one or more subexpressions
+// is itself a PCRel (i.e. "." in asm or some other pcrel construct)
+// but this is good enough for now.
+static bool EvaluateAsPCRel(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  default: assert(0 && "Unexpected expression type");
+  case MCExpr::SymbolRef: return false;
+  case MCExpr::Binary: return true;
+  }
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // {20-16} = imm{15-12}
+  // {11-0}  = imm{11-0}
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    // Hi / lo 16 bits already extracted during earlier passes.
+    return static_cast<unsigned>(MO.getImm());
+
+  // Handle :upper16: and :lower16: assembly prefixes.
+  const MCExpr *E = MO.getExpr();
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    MCFixupKind Kind;
+    switch (ARM16Expr->getKind()) {
+    default: assert(0 && "Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      if (!isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movt_hi16_pcrel
+                           : ARM::fixup_arm_movt_hi16_pcrel);
+      else
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movt_hi16
+                           : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      if (!isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movw_lo16_pcrel
+                           : ARM::fixup_arm_movw_lo16_pcrel);
+      else
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movw_lo16
+                           : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+    Fixups.push_back(MCFixup::Create(0, E, Kind));
+    return 0;
+  };
+
+  llvm_unreachable("Unsupported MCExpr type in MCOperand!");
+  return 0;
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Rm = getARMRegisterNumbering(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // {16-13} = Rn
+  // {12}    = isAdd
+  // {11-0}  = shifter
+  //  {3-0}  = Rm
+  //  {4}    = 0
+  //  {6-5}  = type
+  //  {11-7} = imm
+  uint32_t Binary = Rm;
+  Binary |= Rn << 13;
+  Binary |= SBits << 5;
+  Binary |= ShImm << 7;
+  if (isAdd)
+    Binary |= 1 << 12;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-14}  Rn
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
+  Binary |= Rn << 14;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
+  bool isReg = MO.getReg() != 0;
+  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
+  if (isReg) {
+    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
+    Binary <<= 7;                    // Shift amount is bits [11:7]
+    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
+    Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0]
+  }
+  return Binary | (isAdd << 12) | (isReg << 13);
+}
+
+uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  // {4}      isAdd
+  // {3-0}    Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  bool isAdd = MO1.getImm() != 0;
+  return getARMRegisterNumbering(MO.getReg()) | (isAdd << 4);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {9}      1 == imm8, 0 == Rm
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = getARMRegisterNumbering(MO.getReg());
+  return Imm8 | (isAdd << 8) | (isImm << 9);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Imm = MO2.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO1.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = getARMRegisterNumbering(MO1.getReg());
+  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // [SP, #imm]
+  //   {7-0} = imm8
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate is already shifted for the implicit zeroes, so no change
+  // here.
+  return MO1.getImm() & 0xff;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  // [Rn, #imm]
+  //   {7-3} = imm5
+  //   {2-0} = Rn
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Imm5 = MO1.getImm();
+  return ((Imm5 & 0x1f) << 3) | Rn;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+  return (MO.getImm() >> 2);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (isThumb2())
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is Rs, the amount to shift by, and the third specifies
+  // the type of the shift.
+  //
+  // {3-0} = Rm.
+  // {4}   = 1
+  // {6-5} = type
+  // {11-8} = Rs
+  // {7}    = 0
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode Rm.
+  unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    }
+  }
+
+  Binary |= SBits << 4;
+
+  // Encode the shift operation Rs.
+  // Encode Rs bit[11:8].
+  assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+  return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  // RRX - 110 and bit[11:8] clear.
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::ror: SBits = 0x6; break;
+  case ARM_AM::rrx:
+    Binary |= 0x60;
+    return Binary;
+  }
+
+  // Encode shift_imm bit[11:7].
+  Binary |= SBits << 4;
+  unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
+  assert(Offset && "Offset must be in range 1-32!");
+  if (Offset == 32) Offset = 0;
+  return Binary | (Offset << 7);
+}
+
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+  const MCOperand &MO3 = MI.getOperand(OpNum+2);
+
+  // Encoded as [Rn, Rm, imm].
+  // FIXME: Needs fixup support.
+  unsigned Value = getARMRegisterNumbering(MO1.getReg());
+  Value <<= 4;
+  Value |= getARMRegisterNumbering(MO2.getReg());
+  Value <<= 2;
+  Value |= MO3.getImm();
+
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = getARMRegisterNumbering(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 4096; // Set the ADD bit
+  Value |= tmp & 4095;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::rrx: // FALLTHROUGH
+  case ARM_AM::ror: SBits = 0x6; break;
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+  // msb of the mask.
+  const MCOperand &MO = MI.getOperand(Op);
+  uint32_t v = ~MO.getImm();
+  uint32_t lsb = CountTrailingZeros_32(v);
+  uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
+  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+  return lsb | (msb << 5);
+}
+
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // VLDM/VSTM:
+  //   {12-8} = Vd
+  //   {7-0}  = Number of registers
+  //
+  // LDM/STM:
+  //   {15-0}  = Bitfield of GPRs.
+  unsigned Reg = MI.getOperand(Op).getReg();
+  bool SPRRegs = llvm::ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+  bool DPRRegs = llvm::ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+
+  unsigned Binary = 0;
+
+  if (SPRRegs || DPRRegs) {
+    // VLDM/VSTM
+    unsigned RegNo = getARMRegisterNumbering(Reg);
+    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+    Binary |= (RegNo & 0x1f) << 8;
+    if (SPRRegs)
+      Binary |= NumRegs;
+    else
+      Binary |= NumRegs * 2;
+  } else {
+    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+      unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg());
+      Binary |= 1 << RegNo;
+    }
+  }
+
+  return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
+/// with the alignment operand.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x02; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:
+  case 16: Align = 0x00; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
+/// alignment operand for use in VLD-dup instructions.  This is the same as
+/// getAddrMode6AddressOpValue except for the alignment encoding, which is
+/// different for VLD4-dup.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.getReg() == 0) return 0x0D;
+  return MO.getReg();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+void ARMMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  // Pseudo instructions don't get encoded.
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+
+  int Size;
+  if (Desc.getSize() == 2 || Desc.getSize() == 4)
+    Size = Desc.getSize();
+  else
+    llvm_unreachable("Unexpected instruction size!");
+
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+  // Thumb 32-bit wide instructions need to emit the high order halfword
+  // first.
+  if (isThumb() && Size == 4) {
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else
+    EmitConstant(Binary, Size, OS);
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
new file mode 100644
index 0000000000000..2727ba8c8aa5f
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -0,0 +1,73 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "armmcexpr"
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAssembler.h"
+using namespace llvm;
+
+const ARMMCExpr*
+ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                       MCContext &Ctx) {
+  return new (Ctx) ARMMCExpr(Kind, Expr);
+}
+
+void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: assert(0 && "Invalid kind!");
+  case VK_ARM_HI16: OS << ":upper16:"; break;
+  case VK_ARM_LO16: OS << ":lower16:"; break;
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+bool
+ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  return false;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    assert(0 && "Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbols_(BE->getLHS(), Asm);
+    AddValueSymbols_(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbols_(getSubExpr(), Asm);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
new file mode 100644
index 0000000000000..0a2e883deb1d4
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -0,0 +1,76 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCEXPR_H
+#define ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class ARMMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_ARM_None,
+    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+  
+public:
+  /// @name Construction
+  /// @{
+
+  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                      MCContext &Ctx);
+
+  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_HI16, Expr, Ctx);
+  }
+
+  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_LO16, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+  void AddValueSymbols(MCAssembler *) const;
+  const MCSection *FindAssociatedSection() const {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+  
+  static bool classof(const ARMMCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index f8fcf2b8aff17..a55c41075d403 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -13,10 +13,16 @@
 
 #include "ARMMCTargetDesc.h"
 #include "ARMMCAsmInfo.h"
+#include "ARMBaseInfo.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_REGINFO_MC_DESC
 #include "ARMGenRegisterInfo.inc"
@@ -35,7 +41,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
   unsigned Len = TT.size();
   unsigned Idx = 0;
 
-  // FIXME: Enahnce Triple helper class to extract ARM version.
+  // FIXME: Enhance Triple helper class to extract ARM version.
   bool isThumb = false;
   if (Len >= 5 && TT.substr(0, 4) == "armv")
     Idx = 4;
@@ -50,18 +56,21 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
     unsigned SubVer = TT[Idx];
     if (SubVer >= '7' && SubVer <= '9') {
       if (Len >= Idx+2 && TT[Idx+1] == 'm') {
-        // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv
-        ARMArchFeature = "+v7,+noarm,+db,+hwdiv";
+        // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+        ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
       } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') {
         // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
-        //       FeatureT2XtPk
-        ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk";
+        //       FeatureT2XtPk, FeatureMClass
+        ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
       } else
-        // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2
-        ARMArchFeature = "+v7,+neon,+db,+t2dsp";
+        // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+        ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
     } else if (SubVer == '6') {
       if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
         ARMArchFeature = "+v6t2";
+      else if (Len >= Idx+2 && TT[Idx+1] == 'm')
+        // v6m: FeatureNoARM, FeatureMClass
+        ARMArchFeature = "+v6t2,+noarm,+mclass";
       else
         ARMArchFeature = "+v6";
     } else if (SubVer == '5') {
@@ -80,6 +89,14 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
       ARMArchFeature += ",+thumb-mode";
   }
 
+  Triple TheTriple(TT);
+  if (TheTriple.getOS() == Triple::NativeClient) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+nacl-mode";
+    else
+      ARMArchFeature += ",+nacl-mode";
+  }
+
   return ARMArchFeature;
 }
 
@@ -98,36 +115,18 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-// Force static initialization.
-extern "C" void LLVMInitializeARMMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
-                                          ARM_MC::createARMMCSubtargetInfo);
-  TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
-                                          ARM_MC::createARMMCSubtargetInfo);
-}
-
 static MCInstrInfo *createARMMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitARMMCInstrInfo(X);
   return X;
 }
 
-extern "C" void LLVMInitializeARMMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
-}
-
-static MCRegisterInfo *createARMMCRegisterInfo() {
+static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
   MCRegisterInfo *X = new MCRegisterInfo();
-  InitARMMCRegisterInfo(X);
+  InitARMMCRegisterInfo(X, ARM::LR);
   return X;
 }
 
-extern "C" void LLVMInitializeARMMCRegInfo() {
-  TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
-  TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
-}
-
 static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
 
@@ -137,8 +136,128 @@ static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
   return new ARMELFMCAsmInfo();
 }
 
-extern "C" void LLVMInitializeARMMCAsmInfo() {
-  // Register the target asm info.
+static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default) {
+    Triple TheTriple(TT);
+    // Default relocation model on Darwin is PIC, not DynamicNoPIC.
+    RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+  }
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+
+  if (TheTriple.isOSWindows()) {
+    llvm_unreachable("ARM does not support Windows COFF format");
+    return NULL;
+  }
+
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(MAI, STI);
+  return 0;
+}
+
+namespace {
+
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  virtual bool isUnconditionalBranch(const MCInst &Inst) const {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return true;
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  virtual bool isConditionalBranch(const MCInst &Inst) const {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return false;
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                          uint64_t Size) const {
+    // We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+      return -1ULL;
+
+    int64_t Imm = Inst.getOperand(0).getImm();
+    // FIXME: This is not right for thumb.
+    return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+  }
+};
+
+}
+
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfoFn A(TheARMTarget, createARMMCAsmInfo);
   RegisterMCAsmInfoFn B(TheThumbTarget, createARMMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheARMTarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbTarget, createARMMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheARMTarget,
+                                          createARMMCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheThumbTarget,
+                                          createARMMCInstrAnalysis);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheARMTarget, createARMAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbTarget, createARMAsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
 }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 74701e3516dc1..9b3d3bd32183b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -14,12 +14,19 @@
 #ifndef ARMMCTARGETDESC_H
 #define ARMMCTARGETDESC_H
 
+#include "llvm/Support/DataTypes.h"
 #include <string>
 
 namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
 class MCSubtargetInfo;
-class Target;
 class StringRef;
+class Target;
+class raw_ostream;
 
 extern Target TheARMTarget, TheThumbTarget;
 
@@ -33,6 +40,18 @@ namespace ARM_MC {
                                             StringRef FS);
 }
 
+MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT);
+
+/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
 } // End llvm namespace
 
 // Defines symbolic names for ARM registers.  This defines a mapping from
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
new file mode 100644
index 0000000000000..352c73e84df1b
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -0,0 +1,388 @@
+//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+  void RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                    const MCAssembler &Asm,
+                                    const MCAsmLayout &Layout,
+                                    const MCFragment *Fragment,
+                                    const MCFixup &Fixup,
+                                    MCValue Target,
+                                    unsigned Log2Size,
+                                    uint64_t &FixedValue);
+  void RecordARMMovwMovtRelocation(MachObjectWriter *Writer,
+                                   const MCAssembler &Asm,
+                                   const MCAsmLayout &Layout,
+                                   const MCFragment *Fragment,
+                                   const MCFixup &Fixup, MCValue Target,
+                                   uint64_t &FixedValue);
+
+public:
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/true) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue);
+};
+}
+
+static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
+                              unsigned &Log2Size) {
+  RelocType = unsigned(macho::RIT_Vanilla);
+  Log2Size = ~0U;
+
+  switch (Kind) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    return true;
+
+    // Handle 24-bit branch kinds.
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+    // Handle Thumb branches.
+  case ARM::fixup_arm_thumb_br:
+    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    Log2Size = llvm::Log2_32(2);
+    return true;
+
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_HalfDifference);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movw_lo16_pcrel:
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_Half);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+  }
+}
+
+void ARMMachObjectWriter::
+RecordARMMovwMovtRelocation(MachObjectWriter *Writer,
+                            const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment,
+                            const MCFixup &Fixup,
+                            MCValue Target,
+                            uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_ARM_Half;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint32_t Value2 = 0;
+  uint64_t SecAddr =
+    Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = macho::RIT_ARM_HalfDifference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
+  //
+  // For these two r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  // the other half of the relocated expression is in the following pair
+  // relocation entry in the the low 16 bits of r_address field.
+  unsigned ThumbBit = 0;
+  unsigned MovtBit = 0;
+  switch ((unsigned)Fixup.getKind()) {
+  default: break;
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+    MovtBit = 1;
+    break;
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+    MovtBit = 1;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    ThumbBit = 1;
+    break;
+  }
+
+
+  if (Type == macho::RIT_ARM_HalfDifference) {
+    uint32_t OtherHalf = MovtBit
+      ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
+
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((OtherHalf       <<  0) |
+                 (macho::RIT_Pair << 24) |
+                 (MovtBit         << 28) |
+                 (ThumbBit        << 29) |
+                 (IsPCRel         << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (MovtBit     << 28) |
+               (ThumbBit    << 29) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_Vanilla;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = macho::RIT_Difference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == macho::RIT_Difference ||
+      Type == macho::RIT_Generic_LocalDifference) {
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((0         <<  0) |
+                 (macho::RIT_Pair  << 24) |
+                 (Log2Size  << 28) |
+                 (IsPCRel   << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (Log2Size    << 28) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
+                                           const MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCFragment *Fragment,
+                                           const MCFixup &Fixup,
+                                           MCValue Target,
+                                           uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size;
+  unsigned RelocType = macho::RIT_Vanilla;
+  if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) {
+    report_fatal_error("unknown ARM fixup kind!");
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry.  Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    if (RelocType == macho::RIT_ARM_Half ||
+        RelocType == macho::RIT_ARM_HalfDifference)
+      return RecordARMMovwMovtRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                         Target, FixedValue);
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, Log2Size, FixedValue);
+  }
+
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // FIXME: For other platforms, we need to use scattered relocations for
+  // internal relocations with offsets.  If this is an internal relocation with
+  // an offset, it also needs a scattered relocation entry.
+  //
+  // Is this right for ARM?
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel && RelocType == macho::RIT_Vanilla)
+    Offset += 1 << Log2Size;
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, Log2Size, FixedValue);
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME!
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    // The type is determined by the fixup kind.
+    Type = RelocType;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 68daf42c91915..adc37cbf582e5 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -1,7 +1,19 @@
 add_llvm_library(LLVMARMDesc
-  ARMMCTargetDesc.cpp
+  ARMAsmBackend.cpp
   ARMMCAsmInfo.cpp
+  ARMMCCodeEmitter.cpp
+  ARMMCExpr.cpp
+  ARMMCTargetDesc.cpp
+  ARMMachObjectWriter.cpp
   )
+add_dependencies(LLVMARMDesc ARMCommonTableGen)
 
 # Hack: we need to include 'main' target directory to grab private headers
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
+
+add_llvm_library_dependencies(LLVMARMDesc
+  LLVMARMInfo
+  LLVMARMAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
index eb8c603544765..3e48ed1189cc4 100644
--- a/lib/Target/ARM/Makefile
+++ b/lib/Target/ARM/Makefile
@@ -16,9 +16,8 @@ BUILT_SOURCES = ARMGenRegisterInfo.inc ARMGenInstrInfo.inc \
 		ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
                 ARMGenDAGISel.inc ARMGenSubtargetInfo.inc \
                 ARMGenCodeEmitter.inc ARMGenCallingConv.inc \
-                ARMGenDecoderTables.inc ARMGenEDInfo.inc \
-                ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
-		ARMGenMCPseudoLowering.inc
+                ARMGenEDInfo.inc ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
+                ARMGenMCPseudoLowering.inc ARMGenDisassemblerTables.inc
 
 DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
 
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
deleted file mode 100644
index c85d1e99705ae..0000000000000
--- a/lib/Target/ARM/NEONMoveFix.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//===-- NEONMoveFix.cpp - Convert vfp reg-reg moves into neon ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "neon-mov-fix"
-#include "ARM.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMInstrInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumVMovs, "Number of reg-reg moves converted");
-
-namespace {
-  struct NEONMoveFixPass : public MachineFunctionPass {
-    static char ID;
-    NEONMoveFixPass() : MachineFunctionPass(ID) {}
-
-    virtual bool runOnMachineFunction(MachineFunction &Fn);
-
-    virtual const char *getPassName() const {
-      return "NEON reg-reg move conversion";
-    }
-
-  private:
-    const TargetRegisterInfo *TRI;
-    const ARMBaseInstrInfo *TII;
-    bool isA8;
-
-    typedef DenseMap<unsigned, const MachineInstr*> RegMap;
-
-    bool InsertMoves(MachineBasicBlock &MBB);
-  };
-  char NEONMoveFixPass::ID = 0;
-}
-
-static bool inNEONDomain(unsigned Domain, bool isA8) {
-  return (Domain & ARMII::DomainNEON) ||
-    (isA8 && (Domain & ARMII::DomainNEONA8));
-}
-
-bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
-  RegMap Defs;
-  bool Modified = false;
-
-  // Walk over MBB tracking the def points of the registers.
-  MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
-  MachineBasicBlock::iterator NextMII;
-  for (; MII != E; MII = NextMII) {
-    NextMII = llvm::next(MII);
-    MachineInstr *MI = &*MII;
-
-    if (MI->getOpcode() == ARM::VMOVD &&
-        !TII->isPredicated(MI)) {
-      unsigned SrcReg = MI->getOperand(1).getReg();
-      // If we do not find an instruction defining the reg, this means the
-      // register should be live-in for this BB. It's always to better to use
-      // NEON reg-reg moves.
-      unsigned Domain = ARMII::DomainNEON;
-      RegMap::iterator DefMI = Defs.find(SrcReg);
-      if (DefMI != Defs.end()) {
-        Domain = DefMI->second->getDesc().TSFlags & ARMII::DomainMask;
-        // Instructions in general domain are subreg accesses.
-        // Map them to NEON reg-reg moves.
-        if (Domain == ARMII::DomainGeneral)
-          Domain = ARMII::DomainNEON;
-      }
-
-      if (inNEONDomain(Domain, isA8)) {
-        // Convert VMOVD to VORRd
-        unsigned DestReg = MI->getOperand(0).getReg();
-
-        DEBUG({errs() << "vmov convert: "; MI->dump();});
-
-        // It's safe to ignore imp-defs / imp-uses here, since:
-        //  - We're running late, no intelligent condegen passes should be run
-        //    afterwards
-        //  - The imp-defs / imp-uses are superregs only, we don't care about
-        //    them.
-        AddDefaultPred(BuildMI(MBB, *MI, MI->getDebugLoc(),
-                             TII->get(ARM::VORRd), DestReg)
-          .addReg(SrcReg).addReg(SrcReg));
-        MBB.erase(MI);
-        MachineBasicBlock::iterator I = prior(NextMII);
-        MI = &*I;
-
-        DEBUG({errs() << "        into: "; MI->dump();});
-
-        Modified = true;
-        ++NumVMovs;
-      } else {
-        assert((Domain & ARMII::DomainVFP) && "Invalid domain!");
-        // Do nothing.
-      }
-    }
-
-    // Update def information.
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-      const MachineOperand& MO = MI->getOperand(i);
-      if (!MO.isReg() || !MO.isDef())
-        continue;
-      unsigned MOReg = MO.getReg();
-
-      Defs[MOReg] = MI;
-      // Catch aliases as well.
-      for (const unsigned *R = TRI->getAliasSet(MOReg); *R; ++R)
-        Defs[*R] = MI;
-    }
-  }
-
-  return Modified;
-}
-
-bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) {
-  ARMFunctionInfo *AFI = Fn.getInfo<ARMFunctionInfo>();
-  const TargetMachine &TM = Fn.getTarget();
-
-  if (AFI->isThumb1OnlyFunction())
-    return false;
-
-  TRI = TM.getRegisterInfo();
-  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
-  isA8 = TM.getSubtarget<ARMSubtarget>().isCortexA8();
-
-  bool Modified = false;
-  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
-       ++MFI) {
-    MachineBasicBlock &MBB = *MFI;
-    Modified |= InsertMoves(MBB);
-  }
-
-  return Modified;
-}
-
-/// createNEONMoveFixPass - Returns an instance of the NEON reg-reg moves fix
-/// pass.
-FunctionPass *llvm::createNEONMoveFixPass() {
-  return new NEONMoveFixPass();
-}
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index 163a0a9875849..500e3de82db34 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "ARM.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheARMTarget, llvm::TheThumbTarget;
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
index 3910bb02e2196..8b38b136ce645 100644
--- a/lib/Target/ARM/TargetInfo/CMakeLists.txt
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMARMInfo
   ARMTargetInfo.cpp
   )
 
-add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
+add_dependencies(LLVMARMInfo ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index c258870e48a58..d8481778c0dae 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -21,7 +21,7 @@
 
 using namespace llvm;
 
-bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
   const MachineFrameInfo *FFI = MF.getFrameInfo();
   unsigned CFSize = FFI->getMaxCallFrameSize();
   // It's not always a good idea to include the call frame as part of the
@@ -133,9 +133,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
 
   // Adjust FP so it point to the stack slot that contains the previous FP.
   if (hasFP(MF)) {
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
       .addFrameIndex(FramePtrSpillFI).addImm(0)
-      .setMIFlags(MachineInstr::FrameSetup);
+      .setMIFlags(MachineInstr::FrameSetup));
     if (NumBytes > 508)
       // If offset is > 508 then sp cannot be adjusted in a single instruction,
       // try restoring from fp instead.
@@ -155,6 +155,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
   AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
   AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
 
+  // Thumb1 does not currently support dynamic stack realignment.  Report a
+  // fatal error rather then silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+      report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
   // If we need a base pointer, set it up here. It's whatever the value
   // of the stack pointer is at this point. Any variable size objects
   // will be allocated after this, so we can still use the base pointer
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 4eb0b6c93e1dc..e8ed482a66fae 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -13,12 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb1RegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -27,7 +27,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -182,7 +181,6 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
   int Opc = 0;
   int ExtraOpc = 0;
   bool NeedCC = false;
-  bool NeedPred = false;
 
   if (DestReg == BaseReg && BaseReg == ARM::SP) {
     assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
@@ -217,7 +215,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
     } else {
       Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
       NumBits = 8;
-      NeedPred = NeedCC = true;
+      NeedCC = true;
     }
     isTwoAddr = true;
   }
@@ -241,7 +239,8 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
       Bytes -= ThisVal;
       const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
       const MachineInstrBuilder MIB =
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg).setMIFlags(MIFlags));
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
+                         .setMIFlags(MIFlags));
       AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
     } else {
       AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
@@ -262,18 +261,15 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
       if (NeedCC)
         MIB = AddDefaultT1CC(MIB);
       MIB.addReg(DestReg).addImm(ThisVal);
-      if (NeedPred)
-        MIB = AddDefaultPred(MIB);
+      MIB = AddDefaultPred(MIB);
       MIB.setMIFlags(MIFlags);
-    }
-    else {
+    } else {
       bool isKill = BaseReg != ARM::SP;
       MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
       if (NeedCC)
         MIB = AddDefaultT1CC(MIB);
       MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
-      if (NeedPred)
-        MIB = AddDefaultPred(MIB);
+      MIB = AddDefaultPred(MIB);
       MIB.setMIFlags(MIFlags);
 
       BaseReg = DestReg;
@@ -285,7 +281,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
         Scale = 1;
         Chunk = ((1 << NumBits) - 1) * Scale;
         Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
-        NeedPred = NeedCC = isTwoAddr = true;
+        NeedCC = isTwoAddr = true;
       }
     }
   }
@@ -405,7 +401,6 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
     unsigned Scale = 1;
     if (FrameReg != ARM::SP) {
       Opcode = ARM::tADDi3;
-      MI.setDesc(TII.get(Opcode));
       NumBits = 3;
     } else {
       NumBits = 8;
@@ -419,10 +414,9 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
       // Turn it into a move.
       MI.setDesc(TII.get(ARM::tMOVr));
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-      // Remove offset and add predicate operands.
+      // Remove offset
       MI.RemoveOperand(FrameRegIdx+1);
       MachineInstrBuilder MIB(&MI);
-      AddDefaultPred(MIB);
       return true;
     }
 
@@ -431,6 +425,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
     if (((Offset / Scale) & ~Mask) == 0) {
       // Replace the FrameIndex with sp / fp
       if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
         removeOperands(MI, FrameRegIdx);
         MachineInstrBuilder MIB(&MI);
         AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
@@ -459,6 +454,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
       // r0 = add sp, 255*4
       // r0 = add r0, (imm - 255*4)
       if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
         removeOperands(MI, FrameRegIdx);
         MachineInstrBuilder MIB(&MI);
         AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
@@ -479,10 +475,6 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
       MI.setDesc(TII.get(ARM::tADDhirr));
       MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true);
       MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false);
-      if (Opcode == ARM::tADDi3) {
-        MachineInstrBuilder MIB(&MI);
-        AddDefaultPred(MIB);
-      }
     }
     return true;
   } else {
@@ -545,9 +537,9 @@ Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
     ++i;
     assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
   }
-  bool Done = false;
-  Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+  bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
   assert (Done && "Unable to resolve frame index!");
+  (void)Done;
 }
 
 /// saveScavengerRegister - Spill the register so it can be used by the
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 360ec009e2016..b6274007b2373 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -124,6 +124,27 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
   if (Uses.count(DstReg) || Defs.count(SrcReg))
     return false;
 
+  // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+  // if we have:
+  //
+  //   movs  r1, r1
+  //   rsb   r1, 0
+  //   movs  r2, r2
+  //   rsb   r2, 0
+  //
+  // we don't want this to be converted to:
+  //
+  //   movs  r1, r1
+  //   movs  r2, r2
+  //   itt   mi
+  //   rsb   r1, 0
+  //   rsb   r2, 0
+  //
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.hasOptionalDef() &&
+      MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR)
+    return false;
+
   // Then peek at the next instruction to see if it's predicated on CC or OCC.
   // If not, then there is nothing to be gained by moving the copy.
   MachineBasicBlock::iterator I = MI; ++I;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 51b56aaeb008e..cf040c822de95 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -14,9 +14,9 @@
 #include "Thumb2InstrInfo.h"
 #include "ARM.h"
 #include "ARMConstantPoolValue.h"
-#include "ARMAddressingModes.h"
 #include "ARMMachineFunctionInfo.h"
 #include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -122,7 +122,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                     const TargetRegisterClass *RC,
                     const TargetRegisterInfo *TRI) const {
   if (RC == ARM::GPRRegisterClass   || RC == ARM::tGPRRegisterClass ||
-      RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) {
+      RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
+      RC == ARM::GPRnopcRegisterClass) {
     DebugLoc DL;
     if (I != MBB.end()) DL = I->getDebugLoc();
 
@@ -149,7 +150,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                      const TargetRegisterClass *RC,
                      const TargetRegisterInfo *TRI) const {
   if (RC == ARM::GPRRegisterClass   || RC == ARM::tGPRRegisterClass ||
-      RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) {
+      RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
+      RC == ARM::GPRnopcRegisterClass) {
     DebugLoc DL;
     if (I != MBB.end()) DL = I->getDebugLoc();
 
@@ -233,9 +235,8 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
       if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
         assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
         Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
-        // FIXME: Fix Thumb1 immediate encoding.
-        BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
-          .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags);
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
         NumBytes = 0;
         continue;
       }
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index c741a6e8a5b7c..89a155c5a7f5c 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -9,11 +9,11 @@
 
 #define DEBUG_TYPE "t2-reduce-size"
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMSubtarget.h"
 #include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -97,11 +97,11 @@ namespace {
     { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,    1,   0,  0,0, 0,0 },
     { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  2,2, 0,0 },
     { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,    1,   0,  2,0, 0,0 },
-    { ARM::t2SXTBr, ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
-    { ARM::t2SXTHr, ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2SXTB,  ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0,1 },
+    { ARM::t2SXTH,  ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0,1 },
     { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,    1,   0,  2,0, 0,0 },
-    { ARM::t2UXTBr, ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
-    { ARM::t2UXTHr, ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2UXTB,  ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0,1 },
+    { ARM::t2UXTH,  ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0,1 },
 
     // FIXME: Clean this up after splitting each Thumb load / store opcode
     // into multiple ones.
@@ -507,6 +507,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
       .addOperand(MI->getOperand(0))
       .addOperand(MI->getOperand(1))
       .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+    AddDefaultPred(MIB);
 
     // Transfer MI flags.
     MIB.setMIFlags(MI->getFlags());
@@ -546,6 +547,10 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
   }
   case ARM::t2RSBri:
   case ARM::t2RSBSri:
+  case ARM::t2SXTB:
+  case ARM::t2SXTH:
+  case ARM::t2UXTB:
+  case ARM::t2UXTH:
     if (MI->getOperand(2).getImm() == 0)
       return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
     break;
@@ -742,7 +747,11 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
     if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
       continue;
     if ((MCID.getOpcode() == ARM::t2RSBSri ||
-         MCID.getOpcode() == ARM::t2RSBri) && i == 2)
+         MCID.getOpcode() == ARM::t2RSBri ||
+         MCID.getOpcode() == ARM::t2SXTB ||
+         MCID.getOpcode() == ARM::t2SXTH ||
+         MCID.getOpcode() == ARM::t2UXTB ||
+         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
       // Skip the zero immediate operand, it's now implicit.
       continue;
     bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
diff --git a/lib/Target/Alpha/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AlphaAsmPrinter.cpp
index 46ae286895a93..5dce06ac86a5d 100644
--- a/lib/Target/Alpha/AlphaAsmPrinter.cpp
+++ b/lib/Target/Alpha/AlphaAsmPrinter.cpp
@@ -26,8 +26,8 @@
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
index 7b91fea54af4d..f877c65cd61f4 100644
--- a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -80,7 +80,7 @@ namespace {
             // Otherwise we don't know that the it's okay to zapnot this entire
             // byte.  Only do this iff we can prove that the missing bits are
             // already null, so the bytezap doesn't need to really null them.
-            BitsToCheck |= ~Constant & (0xFF << 8*i);
+            BitsToCheck |= ~Constant & (0xFFULL << 8*i);
           }
         }
       }
@@ -114,9 +114,8 @@ namespace {
       if (!x) return 0;
       unsigned at = CountLeadingZeros_64(x);
       uint64_t complow = 1ULL << (63 - at);
-      uint64_t comphigh = 1ULL << (64 - at);
-      //cerr << x << ":" << complow << ":" << comphigh << "\n";
-      if (abs64(complow - x) <= abs64(comphigh - x))
+      uint64_t comphigh = complow << 1;
+      if (x - complow <= comphigh - x)
         return complow;
       else
         return comphigh;
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index de003fb4c65e1..3057eb8c57fbe 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -49,6 +49,7 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
   // Set up the TargetLowering object.
   //I am having problems with shr n i8 1
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
   addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
@@ -153,6 +154,9 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
   setOperationAction(ISD::JumpTable, MVT::i32, Custom);
 
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
   setStackPointerRegisterToSaveRestore(Alpha::R30);
 
   setJumpBufSize(272);
@@ -160,10 +164,12 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
 
   setMinFunctionAlignment(4);
 
+  setInsertFencesForAtomic(true);
+
   computeRegisterProperties();
 }
 
-MVT::SimpleValueType AlphaTargetLowering::getSetCCResultType(EVT VT) const {
+EVT AlphaTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i64;
 }
 
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index 13383f4430f9a..80f8efaea5d2f 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -66,7 +66,7 @@ namespace llvm {
     virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i64; }
 
     /// getSetCCResultType - Get the SETCC result ValueType
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(EVT VT) const;
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     ///
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
index 4dcec8f31750b..8df2ed75f6255 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.cpp
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -16,7 +16,6 @@
 #include "AlphaMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index b20171224e29c..c8c9377c3d8d6 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -607,6 +607,8 @@ def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 1), (i64 imm)),
 def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 imm), (i64 imm)),
           (MB)>;
 
+def : Pat<(atomic_fence (imm), (imm)), (MB)>;
+
 //Basic Floating point ops
 
 //Floats
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
index df8f157266e1b..8b6230fa2a242 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.cpp
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -21,7 +21,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -40,8 +39,7 @@
 using namespace llvm;
 
 AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii)
-  : AlphaGenRegisterInfo(),
-    TII(tii) {
+  : AlphaGenRegisterInfo(Alpha::R26), TII(tii) {
 }
 
 static long getUpper16(long l) {
@@ -178,10 +176,6 @@ AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 }
 
-unsigned AlphaRegisterInfo::getRARegister() const {
-  return Alpha::R26;
-}
-
 unsigned AlphaRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
@@ -198,16 +192,6 @@ unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
   return 0;
 }
 
-int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
-}
-
-int AlphaRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
-}
-
 std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
 {
   std::string s(AlphaRegDesc[reg].Name);
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
index 1072bf73f1999..e35be273c7cd9 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.h
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -42,16 +42,12 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
 
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
   static std::string getPrettyName(unsigned reg);
 };
 
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
index 624a5e2ebd09b..bd55ce9e315a2 100644
--- a/lib/Target/Alpha/AlphaSubtarget.cpp
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -13,7 +13,6 @@
 
 #include "AlphaSubtarget.h"
 #include "Alpha.h"
-#include "llvm/Target/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
index 3b65d41be892b..fc9a6771a9003 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.cpp
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -14,7 +14,7 @@
 #include "AlphaTargetMachine.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 extern "C" void LLVMInitializeAlphaTarget() { 
@@ -22,19 +22,17 @@ extern "C" void LLVMInitializeAlphaTarget() {
   RegisterTargetMachine<AlphaTargetMachine> X(TheAlphaTarget);
 }
 
-AlphaTargetMachine::AlphaTargetMachine(const Target &T, const std::string &TT,
-                                       const std::string &CPU,
-                                       const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+AlphaTargetMachine::AlphaTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     DataLayout("e-f128:128:128-n64"),
     FrameLowering(Subtarget),
     Subtarget(TT, CPU, FS),
     TLInfo(*this),
     TSInfo(*this) {
-  setRelocationModel(Reloc::PIC_);
 }
 
-
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
index cf00e5875d342..48bb948a79456 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.h
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -36,8 +36,9 @@ class AlphaTargetMachine : public LLVMTargetMachine {
   AlphaSelectionDAGInfo TSInfo;
 
 public:
-  AlphaTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS);
+  AlphaTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 
   virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
   virtual const TargetFrameLowering  *getFrameLowering() const {
diff --git a/lib/Target/Alpha/CMakeLists.txt b/lib/Target/Alpha/CMakeLists.txt
index a6027bbf0b2ad..a6d551618b9fd 100644
--- a/lib/Target/Alpha/CMakeLists.txt
+++ b/lib/Target/Alpha/CMakeLists.txt
@@ -1,11 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS Alpha.td)
 
-tablegen(AlphaGenRegisterInfo.inc -gen-register-info)
-tablegen(AlphaGenInstrInfo.inc -gen-instr-info)
-tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
-tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
-tablegen(AlphaGenCallingConv.inc -gen-callingconv)
-tablegen(AlphaGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(AlphaGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(AlphaGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(AlphaGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(AlphaGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(AlphaCommonTableGen)
 
 add_llvm_target(AlphaCodeGen
   AlphaAsmPrinter.cpp
@@ -21,5 +22,17 @@ add_llvm_target(AlphaCodeGen
   AlphaSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMAlphaCodeGen
+  LLVMAlphaDesc
+  LLVMAlphaInfo
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp b/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
index 562052b6df678..4ad021ca6761f 100644
--- a/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
+++ b/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
@@ -13,10 +13,11 @@
 
 #include "AlphaMCTargetDesc.h"
 #include "AlphaMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "AlphaGenInstrInfo.inc"
@@ -36,8 +37,10 @@ static MCInstrInfo *createAlphaMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeAlphaMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheAlphaTarget, createAlphaMCInstrInfo);
+static MCRegisterInfo *createAlphaMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAlphaMCRegisterInfo(X, Alpha::R26);
+  return X;
 }
 
 static MCSubtargetInfo *createAlphaMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,11 +50,29 @@ static MCSubtargetInfo *createAlphaMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeAlphaMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheAlphaTarget,
-                                          createAlphaMCSubtargetInfo);
+static MCCodeGenInfo *createAlphaMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(Reloc::PIC_, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeAlphaMCAsmInfo() {
+// Force static initialization.
+extern "C" void LLVMInitializeAlphaTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfo<AlphaMCAsmInfo> X(TheAlphaTarget);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheAlphaTarget,
+                                        createAlphaMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheAlphaTarget, createAlphaMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheAlphaTarget, createAlphaMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheAlphaTarget,
+                                          createAlphaMCSubtargetInfo);
 }
diff --git a/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt b/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
index ad0dd26aafb10..f745ecbdb67fb 100644
--- a/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMAlphaDesc
   AlphaMCTargetDesc.cpp
   AlphaMCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMAlphaDesc
+  LLVMAlphaInfo
+  LLVMMC
+  )
+
+add_dependencies(LLVMAlphaDesc AlphaCommonTableGen)
diff --git a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
index f7099b9ae9753..bdc69e788bcc6 100644
--- a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
+++ b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "Alpha.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 llvm::Target llvm::TheAlphaTarget;
diff --git a/lib/Target/Alpha/TargetInfo/CMakeLists.txt b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
index 2a7291b90aeb9..cac3178b789d0 100644
--- a/lib/Target/Alpha/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMAlphaInfo
   AlphaTargetInfo.cpp
   )
 
-add_dependencies(LLVMAlphaInfo AlphaCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMAlphaInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMAlphaInfo AlphaCommonTableGen)
diff --git a/lib/Target/Blackfin/BlackfinAsmPrinter.cpp b/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
index 6ba258beb2b8c..ed9844e1bee47 100644
--- a/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
+++ b/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
@@ -29,9 +29,9 @@
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.h b/lib/Target/Blackfin/BlackfinFrameLowering.h
index 726fa2c063f7b..169aa8e3011de 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.h
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ALPHA_FRAMEINFO_H
-#define ALPHA_FRAMEINFO_H
+#ifndef BLACKFIN_FRAMEINFO_H
+#define BLACKFIN_FRAMEINFO_H
 
 #include "Blackfin.h"
 #include "BlackfinSubtarget.h"
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index d5728324de87a..7d4c45fdf6653 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -42,6 +42,7 @@ using namespace llvm;
 BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
   : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
   setStackPointerRegisterToSaveRestore(BF::SP);
   setIntDivIsCheap(false);
 
@@ -99,6 +100,7 @@ BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
 
   // Blackfin has no intrinsics for these particular operations.
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
@@ -134,7 +136,7 @@ const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-MVT::SimpleValueType BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
+EVT BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
   // SETCC always sets the CC register. Technically that is an i1 register, but
   // that type is not legal, so we treat it as an i32 register.
   return MVT::i32;
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
index b65775b9285d6..90908baaae9d3 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.h
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -33,7 +33,7 @@ namespace llvm {
   public:
     BlackfinTargetLowering(TargetMachine &TM);
     virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i16; }
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(EVT VT) const;
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
     virtual void ReplaceNodeResults(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
index d190ae7984b23..c06a919708d6b 100644
--- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -16,10 +16,10 @@
 #include "Blackfin.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_CTOR
 #include "BlackfinGenInstrInfo.inc"
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
index ae8ee9e2a1a26..71356768dd5dd 100644
--- a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
@@ -34,7 +34,7 @@ namespace bfinIntrinsic {
 
 }
 
-std::string BlackfinIntrinsicInfo::getName(unsigned IntrID, const Type **Tys,
+std::string BlackfinIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
                                            unsigned numTys) const {
   static const char *const names[] = {
 #define GET_INTRINSIC_NAME_TABLE
@@ -81,8 +81,8 @@ bool BlackfinIntrinsicInfo::isOverloaded(unsigned IntrID) const {
 #include "BlackfinGenIntrinsics.inc"
 #undef GET_INTRINSIC_ATTRIBUTES
 
-static const FunctionType *getType(LLVMContext &Context, unsigned id) {
-  const Type *ResultTy = NULL;
+static FunctionType *getType(LLVMContext &Context, unsigned id) {
+  Type *ResultTy = NULL;
   std::vector<Type*> ArgTys;
   bool IsVarArg = false;
   
@@ -94,7 +94,7 @@ static const FunctionType *getType(LLVMContext &Context, unsigned id) {
 }
 
 Function *BlackfinIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
-                                                const Type **Tys,
+                                                Type **Tys,
                                                 unsigned numTy) const {
   assert(!isOverloaded(IntrID) && "Blackfin intrinsics are not overloaded");
   AttrListPtr AList = getAttributes((bfinIntrinsic::ID) IntrID);
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
index 7c4b5a9967d2a..f05db5ad7cd97 100644
--- a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
@@ -19,11 +19,11 @@ namespace llvm {
 
   class BlackfinIntrinsicInfo : public TargetIntrinsicInfo {
   public:
-    std::string getName(unsigned IntrID, const Type **Tys = 0,
+    std::string getName(unsigned IntrID, Type **Tys = 0,
                         unsigned numTys = 0) const;
     unsigned lookupName(const char *Name, unsigned Len) const;
     bool isOverloaded(unsigned IID) const;
-    Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0,
+    Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
                              unsigned numTys = 0) const;
   };
 
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
index 3a7c104ee0555..0d415c5f342be 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -20,7 +20,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -37,7 +36,7 @@ using namespace llvm;
 
 BlackfinRegisterInfo::BlackfinRegisterInfo(BlackfinSubtarget &st,
                                            const TargetInstrInfo &tii)
-  : BlackfinGenRegisterInfo(), Subtarget(st), TII(tii) {}
+  : BlackfinGenRegisterInfo(BF::RETS), Subtarget(st), TII(tii) {}
 
 const unsigned*
 BlackfinRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
@@ -327,10 +326,6 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 }
 
-unsigned BlackfinRegisterInfo::getRARegister() const {
-  return BF::RETS;
-}
-
 unsigned
 BlackfinRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -347,14 +342,3 @@ unsigned BlackfinRegisterInfo::getEHHandlerRegister() const {
   llvm_unreachable("What is the exception handler register");
   return 0;
 }
-
-int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
-}
-
-int BlackfinRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum,
-                                        bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
-}
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
index 86f45c17c6252..6ac22af793e01 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.h
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -53,15 +53,11 @@ namespace llvm {
                              int SPAdj, RegScavenger *RS = NULL) const;
 
     unsigned getFrameRegister(const MachineFunction &MF) const;
-    unsigned getRARegister() const;
 
     // Exception handling queries.
     unsigned getEHExceptionRegister() const;
     unsigned getEHHandlerRegister() const;
 
-    int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
     // Utility functions
     void adjustRegister(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator I,
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.cpp b/lib/Target/Blackfin/BlackfinSubtarget.cpp
index ec919cdf0b905..0bdce09177eda 100644
--- a/lib/Target/Blackfin/BlackfinSubtarget.cpp
+++ b/lib/Target/Blackfin/BlackfinSubtarget.cpp
@@ -13,7 +13,7 @@
 
 #include "BlackfinSubtarget.h"
 #include "Blackfin.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.cpp b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
index a1c9f1c05e0d1..a4ae46b90fa0c 100644
--- a/lib/Target/Blackfin/BlackfinTargetMachine.cpp
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
@@ -13,7 +13,7 @@
 #include "BlackfinTargetMachine.h"
 #include "Blackfin.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
@@ -22,10 +22,12 @@ extern "C" void LLVMInitializeBlackfinTarget() {
 }
 
 BlackfinTargetMachine::BlackfinTargetMachine(const Target &T,
-                                             const std::string &TT,
-                                             const std::string &CPU,
-                                             const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+                                             StringRef TT,
+                                             StringRef CPU,
+                                             StringRef FS,
+                                             Reloc::Model RM,
+                                             CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     DataLayout("e-p:32:32-i64:32-f64:32-n32"),
     Subtarget(TT, CPU, FS),
     TLInfo(*this),
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.h b/lib/Target/Blackfin/BlackfinTargetMachine.h
index bd7dc84f04ae6..c85337fe237fc 100644
--- a/lib/Target/Blackfin/BlackfinTargetMachine.h
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.h
@@ -35,8 +35,9 @@ namespace llvm {
     BlackfinFrameLowering FrameLowering;
     BlackfinIntrinsicInfo IntrinsicInfo;
   public:
-    BlackfinTargetMachine(const Target &T, const std::string &TT,
-                          const std::string &CPU, const std::string &FS);
+    BlackfinTargetMachine(const Target &T, StringRef TT,
+                          StringRef CPU, StringRef FS,
+                          Reloc::Model RM, CodeModel::Model CM);
 
     virtual const BlackfinInstrInfo *getInstrInfo() const { return &InstrInfo; }
     virtual const TargetFrameLowering *getFrameLowering() const {
diff --git a/lib/Target/Blackfin/CMakeLists.txt b/lib/Target/Blackfin/CMakeLists.txt
index d3f33a987e697..94d05fbf88782 100644
--- a/lib/Target/Blackfin/CMakeLists.txt
+++ b/lib/Target/Blackfin/CMakeLists.txt
@@ -1,12 +1,13 @@
 set(LLVM_TARGET_DEFINITIONS Blackfin.td)
 
-tablegen(BlackfinGenRegisterInfo.inc -gen-register-info)
-tablegen(BlackfinGenInstrInfo.inc -gen-instr-info)
-tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
-tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
-tablegen(BlackfinGenSubtargetInfo.inc -gen-subtarget)
-tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
-tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
+llvm_tablegen(BlackfinGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(BlackfinGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(BlackfinGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
+add_public_tablegen_target(BlackfinCommonTableGen)
 
 add_llvm_target(BlackfinCodeGen
   BlackfinAsmPrinter.cpp
@@ -21,5 +22,17 @@ add_llvm_target(BlackfinCodeGen
   BlackfinSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMBlackfinCodeGen
+  LLVMAsmPrinter
+  LLVMBlackfinDesc
+  LLVMBlackfinInfo
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp b/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
index 0fa1471ae3e7d..272e3c2bbb75d 100644
--- a/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
+++ b/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
@@ -13,10 +13,11 @@
 
 #include "BlackfinMCTargetDesc.h"
 #include "BlackfinMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "BlackfinGenInstrInfo.inc"
@@ -36,12 +37,12 @@ static MCInstrInfo *createBlackfinMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeBlackfinMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheBlackfinTarget,
-                                      createBlackfinMCInstrInfo);
+static MCRegisterInfo *createBlackfinMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitBlackfinMCRegisterInfo(X, BF::RETS);
+  return X;
 }
 
-
 static MCSubtargetInfo *createBlackfinMCSubtargetInfo(StringRef TT,
                                                       StringRef CPU,
                                                       StringRef FS) {
@@ -50,11 +51,31 @@ static MCSubtargetInfo *createBlackfinMCSubtargetInfo(StringRef TT,
   return X;
 }
 
-extern "C" void LLVMInitializeBlackfinMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheBlackfinTarget,
-                                          createBlackfinMCSubtargetInfo);
+static MCCodeGenInfo *createBlackfinMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                  CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeBlackfinMCAsmInfo() {
+// Force static initialization.
+extern "C" void LLVMInitializeBlackfinTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfo<BlackfinMCAsmInfo> X(TheBlackfinTarget);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheBlackfinTarget,
+                                        createBlackfinMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheBlackfinTarget,
+                                      createBlackfinMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheBlackfinTarget,
+                                    createBlackfinMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheBlackfinTarget,
+                                          createBlackfinMCSubtargetInfo);
 }
diff --git a/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt b/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
index 8cd924f9236ff..73315d852cbda 100644
--- a/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMBlackfinDesc
   BlackfinMCTargetDesc.cpp
   BlackfinMCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMBlackfinDesc
+  LLVMBlackfinInfo
+  LLVMMC
+  )
+
+add_dependencies(LLVMBlackfinDesc BlackfinCommonTableGen)
diff --git a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
index 402e0afde81db..57f1d3e95fbf6 100644
--- a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
+++ b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "Blackfin.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
index 5ca80604f63cd..771f092eb0628 100644
--- a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMBlackfinInfo
   BlackfinTargetInfo.cpp
   )
 
-add_dependencies(LLVMBlackfinInfo BlackfinCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMBlackfinInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMBlackfinInfo BlackfinCommonTableGen)
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 415beb1dd1cd4..69d8c46a50246 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -37,10 +37,11 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -48,6 +49,7 @@
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/InstVisitor.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Config/config.h"
 #include <algorithm>
@@ -62,12 +64,6 @@ extern "C" void LLVMInitializeCBackendTarget() {
   RegisterTargetMachine<CTargetMachine> X(TheCBackendTarget);
 }
 
-extern "C" void LLVMInitializeCBackendMCAsmInfo() {}
-
-extern "C" void LLVMInitializeCBackendMCInstrInfo() {}
-
-extern "C" void LLVMInitializeCBackendMCSubtargetInfo() {}
-
 namespace {
   class CBEMCAsmInfo : public MCAsmInfo {
   public:
@@ -86,6 +82,8 @@ namespace {
     LoopInfo *LI;
     const Module *TheModule;
     const MCAsmInfo* TAsm;
+    const MCRegisterInfo *MRI;
+    const MCObjectFileInfo *MOFI;
     MCContext *TCtx;
     const TargetData* TD;
     
@@ -99,14 +97,14 @@ namespace {
 
     /// UnnamedStructIDs - This contains a unique ID for each struct that is
     /// either anonymous or has no name.
-    DenseMap<const StructType*, unsigned> UnnamedStructIDs;
+    DenseMap<StructType*, unsigned> UnnamedStructIDs;
     
   public:
     static char ID;
     explicit CWriter(formatted_raw_ostream &o)
       : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
-        TheModule(0), TAsm(0), TCtx(0), TD(0), OpaqueCounter(0),
-        NextAnonValueNumber(0) {
+        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
+        OpaqueCounter(0), NextAnonValueNumber(0) {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
       FPCounter = 0;
     }
@@ -145,6 +143,8 @@ namespace {
       delete Mang;
       delete TCtx;
       delete TAsm;
+      delete MRI;
+      delete MOFI;
       FPConstantMap.clear();
       ByValParams.clear();
       intrinsicPrototypesAlreadyGenerated.clear();
@@ -152,20 +152,20 @@ namespace {
       return false;
     }
 
-    raw_ostream &printType(raw_ostream &Out, const Type *Ty,
+    raw_ostream &printType(raw_ostream &Out, Type *Ty,
                            bool isSigned = false,
                            const std::string &VariableName = "",
                            bool IgnoreName = false,
                            const AttrListPtr &PAL = AttrListPtr());
-    raw_ostream &printSimpleType(raw_ostream &Out, const Type *Ty,
+    raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty,
                                  bool isSigned,
                                  const std::string &NameSoFar = "");
 
     void printStructReturnPointerFunctionType(raw_ostream &Out,
                                               const AttrListPtr &PAL,
-                                              const PointerType *Ty);
+                                              PointerType *Ty);
 
-    std::string getStructName(const StructType *ST);
+    std::string getStructName(StructType *ST);
     
     /// writeOperandDeref - Print the result of dereferencing the specified
     /// operand with '*'.  This is equivalent to printing '*' then using
@@ -188,7 +188,7 @@ namespace {
     void writeOperandWithCast(Value* Operand, const ICmpInst &I);
     bool writeInstructionCast(const Instruction &I);
 
-    void writeMemoryAccess(Value *Operand, const Type *OperandType,
+    void writeMemoryAccess(Value *Operand, Type *OperandType,
                            bool IsVolatile, unsigned Alignment);
 
   private :
@@ -200,7 +200,7 @@ namespace {
     void printIntrinsicDefinition(const Function &F, raw_ostream &Out);
 
     void printModuleTypes();
-    void printContainedStructs(const Type *Ty, SmallPtrSet<const Type *, 16> &);
+    void printContainedStructs(Type *Ty, SmallPtrSet<Type *, 16> &);
     void printFloatingPointConstants(Function &F);
     void printFloatingPointConstants(const Constant *C);
     void printFunctionSignature(const Function *F, bool Prototype);
@@ -209,7 +209,7 @@ namespace {
     void printBasicBlock(BasicBlock *BB);
     void printLoop(Loop *L);
 
-    void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy);
+    void printCast(unsigned opcode, Type *SrcTy, Type *DstTy);
     void printConstant(Constant *CPV, bool Static);
     void printConstantWithCast(Constant *CPV, unsigned Opcode);
     bool printConstExprCast(const ConstantExpr *CE, bool Static);
@@ -288,10 +288,12 @@ namespace {
     void visitInvokeInst(InvokeInst &I) {
       llvm_unreachable("Lowerinvoke pass didn't work!");
     }
-
     void visitUnwindInst(UnwindInst &I) {
       llvm_unreachable("Lowerinvoke pass didn't work!");
     }
+    void visitResumeInst(ResumeInst &I) {
+      llvm_unreachable("DwarfEHPrepare pass didn't work!");
+    }
     void visitUnreachableInst(UnreachableInst &I);
 
     void visitPHINode(PHINode &I);
@@ -360,8 +362,8 @@ static std::string CBEMangle(const std::string &S) {
   return Result;
 }
 
-std::string CWriter::getStructName(const StructType *ST) {
-  if (!ST->isAnonymous() && !ST->getName().empty())
+std::string CWriter::getStructName(StructType *ST) {
+  if (!ST->isLiteral() && !ST->getName().empty())
     return CBEMangle("l_"+ST->getName().str());
   
   return "l_unnamed_" + utostr(UnnamedStructIDs[ST]);
@@ -373,20 +375,20 @@ std::string CWriter::getStructName(const StructType *ST) {
 /// print it as "Struct (*)(...)", for struct return functions.
 void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
                                                    const AttrListPtr &PAL,
-                                                   const PointerType *TheTy) {
-  const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
+                                                   PointerType *TheTy) {
+  FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
   std::string tstr;
   raw_string_ostream FunctionInnards(tstr);
   FunctionInnards << " (*) (";
   bool PrintedType = false;
 
   FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
-  const Type *RetTy = cast<PointerType>(*I)->getElementType();
+  Type *RetTy = cast<PointerType>(*I)->getElementType();
   unsigned Idx = 1;
   for (++I, ++Idx; I != E; ++I, ++Idx) {
     if (PrintedType)
       FunctionInnards << ", ";
-    const Type *ArgTy = *I;
+    Type *ArgTy = *I;
     if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
       assert(ArgTy->isPointerTy());
       ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -408,7 +410,7 @@ void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
 }
 
 raw_ostream &
-CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
+CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned,
                          const std::string &NameSoFar) {
   assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) &&
          "Invalid type for printSimpleType");
@@ -444,7 +446,7 @@ CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
                      " __attribute__((vector_size(64))) " + NameSoFar);
 
   case Type::VectorTyID: {
-    const VectorType *VTy = cast<VectorType>(Ty);
+    VectorType *VTy = cast<VectorType>(Ty);
     return printSimpleType(Out, VTy->getElementType(), isSigned,
                      " __attribute__((vector_size(" +
                      utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
@@ -461,7 +463,7 @@ CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
 // Pass the Type* and the variable name and this prints out the variable
 // declaration.
 //
-raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
+raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
                                 bool isSigned, const std::string &NameSoFar,
                                 bool IgnoreName, const AttrListPtr &PAL) {
   if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) {
@@ -471,14 +473,14 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
 
   switch (Ty->getTypeID()) {
   case Type::FunctionTyID: {
-    const FunctionType *FTy = cast<FunctionType>(Ty);
+    FunctionType *FTy = cast<FunctionType>(Ty);
     std::string tstr;
     raw_string_ostream FunctionInnards(tstr);
     FunctionInnards << " (" << NameSoFar << ") (";
     unsigned Idx = 1;
     for (FunctionType::param_iterator I = FTy->param_begin(),
            E = FTy->param_end(); I != E; ++I) {
-      const Type *ArgTy = *I;
+      Type *ArgTy = *I;
       if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
         assert(ArgTy->isPointerTy());
         ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -502,7 +504,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
     return Out;
   }
   case Type::StructTyID: {
-    const StructType *STy = cast<StructType>(Ty);
+    StructType *STy = cast<StructType>(Ty);
     
     // Check to see if the type is named.
     if (!IgnoreName)
@@ -523,7 +525,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
   }
 
   case Type::PointerTyID: {
-    const PointerType *PTy = cast<PointerType>(Ty);
+    PointerType *PTy = cast<PointerType>(Ty);
     std::string ptrName = "*" + NameSoFar;
 
     if (PTy->getElementType()->isArrayTy() ||
@@ -537,7 +539,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
   }
 
   case Type::ArrayTyID: {
-    const ArrayType *ATy = cast<ArrayType>(Ty);
+    ArrayType *ATy = cast<ArrayType>(Ty);
     unsigned NumElements = ATy->getNumElements();
     if (NumElements == 0) NumElements = 1;
     // Arrays are wrapped in structs to allow them to have normal
@@ -560,7 +562,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
   // As a special case, print the array as a string if it is an array of
   // ubytes or an array of sbytes with positive values.
   //
-  const Type *ETy = CPA->getType()->getElementType();
+  Type *ETy = CPA->getType()->getElementType();
   bool isString = (ETy == Type::getInt8Ty(CPA->getContext()) ||
                    ETy == Type::getInt8Ty(CPA->getContext()));
 
@@ -682,7 +684,7 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
 /// Print out the casting for a cast operation. This does the double casting
 /// necessary for conversion to the destination type, if necessary.
 /// @brief Print a cast
-void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
+void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
   // Print the destination type cast
   switch (opc) {
     case Instruction::UIToFP:
@@ -917,7 +919,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
   }
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
-    const Type* Ty = CI->getType();
+    Type* Ty = CI->getType();
     if (Ty == Type::getInt1Ty(CPV->getContext()))
       Out << (CI->getZExtValue() ? '1' : '0');
     else if (Ty == Type::getInt32Ty(CPV->getContext()))
@@ -1027,7 +1029,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
       printConstantArray(CA, Static);
     } else {
       assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-      const ArrayType *AT = cast<ArrayType>(CPV->getType());
+      ArrayType *AT = cast<ArrayType>(CPV->getType());
       Out << '{';
       if (AT->getNumElements()) {
         Out << ' ';
@@ -1054,7 +1056,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
       printConstantVector(CV, Static);
     } else {
       assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-      const VectorType *VT = cast<VectorType>(CPV->getType());
+      VectorType *VT = cast<VectorType>(CPV->getType());
       Out << "{ ";
       Constant *CZ = Constant::getNullValue(VT->getElementType());
       printConstant(CZ, Static);
@@ -1074,7 +1076,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
       Out << ")";
     }
     if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
-      const StructType *ST = cast<StructType>(CPV->getType());
+      StructType *ST = cast<StructType>(CPV->getType());
       Out << '{';
       if (ST->getNumElements()) {
         Out << ' ';
@@ -1123,7 +1125,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
 // care of detecting that case and printing the cast for the ConstantExpr.
 bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
   bool NeedsExplicitCast = false;
-  const Type *Ty = CE->getOperand(0)->getType();
+  Type *Ty = CE->getOperand(0)->getType();
   bool TypeIsSigned = false;
   switch (CE->getOpcode()) {
   case Instruction::Add:
@@ -1175,7 +1177,7 @@ bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
 void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
 
   // Extract the operand's type, we'll need it.
-  const Type* OpTy = CPV->getType();
+  Type* OpTy = CPV->getType();
 
   // Indicate whether to do the cast or not.
   bool shouldCast = false;
@@ -1267,7 +1269,7 @@ std::string CWriter::GetValueName(const Value *Operand) {
 void CWriter::writeInstComputationInline(Instruction &I) {
   // We can't currently support integer types other than 1, 8, 16, 32, 64.
   // Validate this.
-  const Type *Ty = I.getType();
+  Type *Ty = I.getType();
   if (Ty->isIntegerTy() && (Ty!=Type::getInt1Ty(I.getContext()) &&
         Ty!=Type::getInt8Ty(I.getContext()) &&
         Ty!=Type::getInt16Ty(I.getContext()) &&
@@ -1330,7 +1332,7 @@ void CWriter::writeOperand(Value *Operand, bool Static) {
 // This function takes care of detecting that case and printing the cast
 // for the Instruction.
 bool CWriter::writeInstructionCast(const Instruction &I) {
-  const Type *Ty = I.getOperand(0)->getType();
+  Type *Ty = I.getOperand(0)->getType();
   switch (I.getOpcode()) {
   case Instruction::Add:
   case Instruction::Sub:
@@ -1362,7 +1364,7 @@ bool CWriter::writeInstructionCast(const Instruction &I) {
 void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
 
   // Extract the operand's type, we'll need it.
-  const Type* OpTy = Operand->getType();
+  Type* OpTy = Operand->getType();
 
   // Indicate whether to do the cast or not.
   bool shouldCast = false;
@@ -1430,7 +1432,7 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
   bool castIsSigned = Cmp.isSigned();
 
   // If the operand was a pointer, convert to a large integer type.
-  const Type* OpTy = Operand->getType();
+  Type* OpTy = Operand->getType();
   if (OpTy->isPointerTy())
     OpTy = TD->getIntPtrType(Operand->getContext());
 
@@ -1665,7 +1667,8 @@ bool CWriter::doInitialization(Module &M) {
     TAsm = Match->createMCAsmInfo(Triple);
 #endif
   TAsm = new CBEMCAsmInfo();
-  TCtx = new MCContext(*TAsm, NULL);
+  MRI  = new MCRegisterInfo();
+  TCtx = new MCContext(*TAsm, *MRI, NULL);
   Mang = new Mangler(*TCtx, *TD);
 
   // Keep track of which functions are static ctors/dtors so they can have
@@ -2049,7 +2052,7 @@ void CWriter::printModuleTypes() {
   for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
     StructType *ST = StructTypes[i];
 
-    if (ST->isAnonymous() || ST->getName().empty())
+    if (ST->isLiteral() || ST->getName().empty())
       UnnamedStructIDs[ST] = NextTypeID++;
 
     std::string Name = getStructName(ST);
@@ -2060,7 +2063,7 @@ void CWriter::printModuleTypes() {
   Out << '\n';
 
   // Keep track of which structures have been printed so far.
-  SmallPtrSet<const Type *, 16> StructPrinted;
+  SmallPtrSet<Type *, 16> StructPrinted;
 
   // Loop over all structures then push them into the stack so they are
   // printed in the correct order.
@@ -2077,8 +2080,8 @@ void CWriter::printModuleTypes() {
 //
 // TODO:  Make this work properly with vector types
 //
-void CWriter::printContainedStructs(const Type *Ty,
-                                SmallPtrSet<const Type *, 16> &StructPrinted) {
+void CWriter::printContainedStructs(Type *Ty,
+                                SmallPtrSet<Type *, 16> &StructPrinted) {
   // Don't walk through pointers.
   if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy())
     return;
@@ -2088,7 +2091,7 @@ void CWriter::printContainedStructs(const Type *Ty,
        E = Ty->subtype_end(); I != E; ++I)
     printContainedStructs(*I, StructPrinted);
 
-  if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+  if (StructType *ST = dyn_cast<StructType>(Ty)) {
     // Check to see if we have already printed this struct.
     if (!StructPrinted.insert(Ty)) return;
     
@@ -2120,7 +2123,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
   }
 
   // Loop over the arguments, printing them...
-  const FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+  FunctionType *FT = cast<FunctionType>(F->getFunctionType());
   const AttrListPtr &PAL = F->getAttributes();
 
   std::string tstr;
@@ -2150,7 +2153,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
           ArgName = GetValueName(I);
         else
           ArgName = "";
-        const Type *ArgTy = I->getType();
+        Type *ArgTy = I->getType();
         if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
           ArgTy = cast<PointerType>(ArgTy)->getElementType();
           ByValParams.insert(I);
@@ -2177,7 +2180,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
 
     for (; I != E; ++I) {
       if (PrintedArg) FunctionInnards << ", ";
-      const Type *ArgTy = *I;
+      Type *ArgTy = *I;
       if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
         assert(ArgTy->isPointerTy());
         ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -2205,7 +2208,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
   FunctionInnards << ')';
 
   // Get the return tpe for the function.
-  const Type *RetTy;
+  Type *RetTy;
   if (!isStructReturn)
     RetTy = F->getReturnType();
   else {
@@ -2222,8 +2225,8 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
 static inline bool isFPIntBitCast(const Instruction &I) {
   if (!isa<BitCastInst>(I))
     return false;
-  const Type *SrcTy = I.getOperand(0)->getType();
-  const Type *DstTy = I.getType();
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DstTy = I.getType();
   return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
          (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
 }
@@ -2237,7 +2240,7 @@ void CWriter::printFunction(Function &F) {
 
   // If this is a struct return function, handle the result with magic.
   if (isStructReturn) {
-    const Type *StructTy =
+    Type *StructTy =
       cast<PointerType>(F.arg_begin()->getType())->getElementType();
     Out << "  ";
     printType(Out, StructTy, false, "StructReturn");
@@ -2380,22 +2383,29 @@ void CWriter::visitReturnInst(ReturnInst &I) {
 
 void CWriter::visitSwitchInst(SwitchInst &SI) {
 
+  Value* Cond = SI.getCondition();
+
   Out << "  switch (";
-  writeOperand(SI.getOperand(0));
+  writeOperand(Cond);
   Out << ") {\n  default:\n";
   printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
   printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
   Out << ";\n";
-  for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) {
+
+  unsigned NumCases = SI.getNumCases();
+  // Skip the first item since that's the default case.
+  for (unsigned i = 1; i < NumCases; ++i) {
+    ConstantInt* CaseVal = SI.getCaseValue(i);
+    BasicBlock* Succ = SI.getSuccessor(i);
     Out << "  case ";
-    writeOperand(SI.getOperand(i));
+    writeOperand(CaseVal);
     Out << ":\n";
-    BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
     printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
     printBranchToBlock(SI.getParent(), Succ, 2);
     if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent())))
       Out << "    break;\n";
   }
+
   Out << "  }\n";
 }
 
@@ -2656,7 +2666,7 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
   Out << ")";
 }
 
-static const char * getFloatBitCastField(const Type *Ty) {
+static const char * getFloatBitCastField(Type *Ty) {
   switch (Ty->getTypeID()) {
     default: llvm_unreachable("Invalid Type");
     case Type::FloatTyID:  return "Float";
@@ -2672,8 +2682,8 @@ static const char * getFloatBitCastField(const Type *Ty) {
 }
 
 void CWriter::visitCastInst(CastInst &I) {
-  const Type *DstTy = I.getType();
-  const Type *SrcTy = I.getOperand(0)->getType();
+  Type *DstTy = I.getType();
+  Type *SrcTy = I.getOperand(0)->getType();
   if (isFPIntBitCast(I)) {
     Out << '(';
     // These int<->float and long<->double casts need to be handled specially
@@ -2719,7 +2729,7 @@ void CWriter::visitSelectInst(SelectInst &I) {
 
 // Returns the macro name or value of the max or min of an integer type
 // (as defined in limits.h).
-static void printLimitValue(const IntegerType &Ty, bool isSigned, bool isMax,
+static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax,
                             raw_ostream &Out) {
   const char* type;
   const char* sprefix = "";
@@ -2745,16 +2755,16 @@ static void printLimitValue(const IntegerType &Ty, bool isSigned, bool isMax,
 }
 
 #ifndef NDEBUG
-static bool isSupportedIntegerSize(const IntegerType &T) {
+static bool isSupportedIntegerSize(IntegerType &T) {
   return T.getBitWidth() == 8 || T.getBitWidth() == 16 ||
          T.getBitWidth() == 32 || T.getBitWidth() == 64;
 }
 #endif
 
 void CWriter::printIntrinsicDefinition(const Function &F, raw_ostream &Out) {
-  const FunctionType *funT = F.getFunctionType();
-  const Type *retT = F.getReturnType();
-  const IntegerType *elemT = cast<IntegerType>(funT->getParamType(1));
+  FunctionType *funT = F.getFunctionType();
+  Type *retT = F.getReturnType();
+  IntegerType *elemT = cast<IntegerType>(funT->getParamType(1));
 
   assert(isSupportedIntegerSize(*elemT) &&
          "CBackend does not support arbitrary size integers.");
@@ -2829,7 +2839,6 @@ void CWriter::lowerIntrinsics(Function &F) {
         if (Function *F = CI->getCalledFunction())
           switch (F->getIntrinsicID()) {
           case Intrinsic::not_intrinsic:
-          case Intrinsic::memory_barrier:
           case Intrinsic::vastart:
           case Intrinsic::vacopy:
           case Intrinsic::vaend:
@@ -2908,8 +2917,8 @@ void CWriter::visitCallInst(CallInst &I) {
 
   Value *Callee = I.getCalledValue();
 
-  const PointerType  *PTy   = cast<PointerType>(Callee->getType());
-  const FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
+  PointerType  *PTy   = cast<PointerType>(Callee->getType());
+  FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
 
   // If this is a call to a struct-return function, assign to the first
   // parameter instead of passing it to the call.
@@ -3020,9 +3029,6 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
     WroteCallee = true;
     return false;
   }
-  case Intrinsic::memory_barrier:
-    Out << "__sync_synchronize()";
-    return true;
   case Intrinsic::vastart:
     Out << "0; ";
 
@@ -3217,7 +3223,7 @@ void CWriter::visitInlineAsm(CallInst &CI) {
   std::vector<std::pair<Value*, int> > ResultVals;
   if (CI.getType() == Type::getVoidTy(CI.getContext()))
     ;
-  else if (const StructType *ST = dyn_cast<StructType>(CI.getType())) {
+  else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
     for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
       ResultVals.push_back(std::make_pair(&CI, (int)i));
   } else {
@@ -3348,7 +3354,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
   // Find out if the last index is into a vector.  If so, we have to print this
   // specially.  Since vectors can't have elements of indexable type, only the
   // last index could possibly be of a vector element.
-  const VectorType *LastIndexIsVector = 0;
+  VectorType *LastIndexIsVector = 0;
   {
     for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
       LastIndexIsVector = dyn_cast<VectorType>(*TmpI);
@@ -3421,7 +3427,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
   Out << ")";
 }
 
-void CWriter::writeMemoryAccess(Value *Operand, const Type *OperandType,
+void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
                                 bool IsVolatile, unsigned Alignment) {
 
   bool IsUnaligned = Alignment &&
@@ -3463,7 +3469,7 @@ void CWriter::visitStoreInst(StoreInst &I) {
   Out << " = ";
   Value *Operand = I.getOperand(0);
   Constant *BitMask = 0;
-  if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
+  if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
     if (!ITy->isPowerOf2ByteWidth())
       // We have a bit width that doesn't match an even power-of-2 byte
       // size. Consequently we must & the value with the type's bit mask
@@ -3492,7 +3498,7 @@ void CWriter::visitVAArgInst(VAArgInst &I) {
 }
 
 void CWriter::visitInsertElementInst(InsertElementInst &I) {
-  const Type *EltTy = I.getType()->getElementType();
+  Type *EltTy = I.getType()->getElementType();
   writeOperand(I.getOperand(0));
   Out << ";\n  ";
   Out << "((";
@@ -3507,7 +3513,7 @@ void CWriter::visitInsertElementInst(InsertElementInst &I) {
 void CWriter::visitExtractElementInst(ExtractElementInst &I) {
   // We know that our operand is not inlined.
   Out << "((";
-  const Type *EltTy =
+  Type *EltTy =
     cast<VectorType>(I.getOperand(0)->getType())->getElementType();
   printType(Out, PointerType::getUnqual(EltTy));
   Out << ")(&" << GetValueName(I.getOperand(0)) << "))[";
@@ -3519,9 +3525,9 @@ void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Out << "(";
   printType(Out, SVI.getType());
   Out << "){ ";
-  const VectorType *VT = SVI.getType();
+  VectorType *VT = SVI.getType();
   unsigned NumElts = VT->getNumElements();
-  const Type *EltTy = VT->getElementType();
+  Type *EltTy = VT->getElementType();
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (i) Out << ", ";
@@ -3557,9 +3563,9 @@ void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
   Out << GetValueName(&IVI);
   for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
        i != e; ++i) {
-    const Type *IndexedTy =
+    Type *IndexedTy =
       ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(),
-                                       ArrayRef<unsigned>(b, i+1));
+                                       makeArrayRef(b, i+1));
     if (IndexedTy->isArrayTy())
       Out << ".array[" << *i << "]";
     else
@@ -3579,9 +3585,9 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
     Out << GetValueName(EVI.getOperand(0));
     for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
          i != e; ++i) {
-      const Type *IndexedTy =
+      Type *IndexedTy =
         ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(),
-                                         ArrayRef<unsigned>(b, i+1));
+                                         makeArrayRef(b, i+1));
       if (IndexedTy->isArrayTy())
         Out << ".array[" << *i << "]";
       else
diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt
index a23ff85297031..96ae49f01fc07 100644
--- a/lib/Target/CBackend/CMakeLists.txt
+++ b/lib/Target/CBackend/CMakeLists.txt
@@ -2,4 +2,16 @@ add_llvm_target(CBackend
   CBackend.cpp
   )
 
+add_llvm_library_dependencies(LLVMCBackend
+  LLVMAnalysis
+  LLVMCBackendInfo
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMScalarOpts
+  LLVMSupport
+  LLVMTarget
+  LLVMTransformUtils
+  )
+
 add_subdirectory(TargetInfo)
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
index e64216be0bdc1..4f1ca974ded95 100644
--- a/lib/Target/CBackend/CTargetMachine.h
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -20,8 +20,9 @@
 namespace llvm {
 
 struct CTargetMachine : public TargetMachine {
-  CTargetMachine(const Target &T, const std::string &TT,
-                 const std::string &CPU, const std::string &FS)
+  CTargetMachine(const Target &T, StringRef TT,
+                 StringRef CPU, StringRef FS,
+                 Reloc::Model RM, CodeModel::Model CM)
     : TargetMachine(T, TT, CPU, FS) {}
 
   virtual bool addPassesToEmitFile(PassManagerBase &PM,
diff --git a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
index f7e8ff254848e..e8274ff9ce5a1 100644
--- a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
+++ b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "CTargetMachine.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheCBackendTarget;
@@ -17,3 +17,5 @@ Target llvm::TheCBackendTarget;
 extern "C" void LLVMInitializeCBackendTargetInfo() { 
   RegisterTarget<> X(TheCBackendTarget, "c", "C backend");
 }
+
+extern "C" void LLVMInitializeCBackendTargetMC() {}
diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
index 5b35fa7c065b3..8e616bebd5320 100644
--- a/lib/Target/CBackend/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
@@ -4,3 +4,8 @@ add_llvm_library(LLVMCBackendInfo
   CBackendTargetInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMCBackendInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index f982316fc087d..030f8089abf7c 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,8 +1,6 @@
 add_llvm_library(LLVMTarget
   Mangler.cpp
   Target.cpp
-  TargetAsmInfo.cpp
-  TargetAsmLexer.cpp
   TargetData.cpp
   TargetELFWriterInfo.cpp
   TargetFrameLowering.cpp
@@ -15,6 +13,12 @@ add_llvm_library(LLVMTarget
   TargetSubtargetInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMTarget
+  LLVMCore
+  LLVMMC
+  LLVMSupport
+  )
+
 set(LLVM_ENUM_ASM_PRINTERS "")
 set(LLVM_ENUM_ASM_PARSERS "")
 set(LLVM_ENUM_DISASSEMBLERS "")
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
index 0b94e0cf1193d..158fb3eacce3f 100644
--- a/lib/Target/CellSPU/CMakeLists.txt
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -1,12 +1,13 @@
 set(LLVM_TARGET_DEFINITIONS SPU.td)
 
-tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
-tablegen(SPUGenCodeEmitter.inc -gen-emitter)
-tablegen(SPUGenRegisterInfo.inc -gen-register-info)
-tablegen(SPUGenInstrInfo.inc -gen-instr-info)
-tablegen(SPUGenDAGISel.inc -gen-dag-isel)
-tablegen(SPUGenSubtargetInfo.inc -gen-subtarget)
-tablegen(SPUGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SPUGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(SPUGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SPUGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SPUGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SPUGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SPUGenCallingConv.inc -gen-callingconv)
+add_public_tablegen_target(CellSPUCommonTableGen)
 
 add_llvm_target(CellSPUCodeGen
   SPUAsmPrinter.cpp
@@ -22,5 +23,17 @@ add_llvm_target(CellSPUCodeGen
   SPUNopFiller.cpp
   )
 
+add_llvm_library_dependencies(LLVMCellSPUCodeGen
+  LLVMAsmPrinter
+  LLVMCellSPUDesc
+  LLVMCellSPUInfo
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
index 85fb258eac2c6..d41fe934e2c5c 100644
--- a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMCellSPUDesc
   SPUMCTargetDesc.cpp
   SPUMCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMCellSPUDesc
+  LLVMCellSPUInfo
+  LLVMMC
+  )
+
+add_dependencies(LLVMCellSPUDesc CellSPUCommonTableGen)
diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
index 26c5a4bc7b33a..d5af2a88aed1e 100644
--- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
+++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
@@ -13,10 +13,12 @@
 
 #include "SPUMCTargetDesc.h"
 #include "SPUMCAsmInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "SPUGenInstrInfo.inc"
@@ -35,8 +37,10 @@ static MCInstrInfo *createSPUMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeCellSPUMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheCellSPUTarget, createSPUMCInstrInfo);
+static MCRegisterInfo *createCellSPUMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSPUMCRegisterInfo(X, SPU::R0);
+  return X;
 }
 
 static MCSubtargetInfo *createSPUMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,11 +50,43 @@ static MCSubtargetInfo *createSPUMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeCellSPUMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheCellSPUTarget,
-                                          createSPUMCSubtargetInfo);
+static MCAsmInfo *createSPUMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new SPULinuxMCAsmInfo(T, TT);
+
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(SPU::R1, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createSPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                      CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  // For the time being, use static relocations, since there's really no
+  // support for PIC yet.
+  X->InitMCCodeGenInfo(Reloc::Static, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeCellSPUMCAsmInfo() {
-  RegisterMCAsmInfo<SPULinuxMCAsmInfo> X(TheCellSPUTarget);
+// Force static initialization.
+extern "C" void LLVMInitializeCellSPUTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheCellSPUTarget, createSPUMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheCellSPUTarget,
+                                        createSPUMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheCellSPUTarget, createSPUMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheCellSPUTarget,
+                                    createCellSPUMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheCellSPUTarget,
+                                          createSPUMCSubtargetInfo);
 }
diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
index c5c037d4de446..a3717b0bfc10e 100644
--- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
+++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- SPUMCTargetDesc.h - Alpha Target Descriptions ---------*- C++ -*-===//
+//===-- SPUMCTargetDesc.h - CellSPU Target Descriptions ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides Alpha specific target descriptions.
+// This file provides CellSPU specific target descriptions.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index fd96694b32fe7..90b5270a9dae2 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -29,10 +29,10 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp
index a3e7e73ae30ae..093f99f287112 100644
--- a/lib/Target/CellSPU/SPUFrameLowering.cpp
+++ b/lib/Target/CellSPU/SPUFrameLowering.cpp
@@ -181,18 +181,6 @@ void SPUFrameLowering::emitPrologue(MachineFunction &MF) const {
       MachineLocation FPSrc(MachineLocation::VirtualFP);
       Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc));
     }
-  } else {
-    // This is a leaf function -- insert a branch hint iff there are
-    // sufficient number instructions in the basic block. Note that
-    // this is just a best guess based on the basic block's size.
-    if (MBB.size() >= (unsigned) SPUFrameLowering::branchHintPenalty()) {
-      MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-      dl = MBBI->getDebugLoc();
-
-      // Insert terminator label
-      BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL))
-        .addSym(MMI.getContext().CreateTempSymbol());
-    }
   }
 }
 
@@ -249,14 +237,6 @@ void SPUFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-void SPUFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
-                                                                         const {
-  // Initial state of the frame pointer is R1.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(SPU::R1, 0);
-  Moves.push_back(MachineMove(0, Dst, Src));
-}
-
 void SPUFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                         RegScavenger *RS) const{
   // Mark LR and SP unused, since the prolog spills them to stack and
diff --git a/lib/Target/CellSPU/SPUFrameLowering.h b/lib/Target/CellSPU/SPUFrameLowering.h
index 4fee72d946a2a..b837f2cf94e1d 100644
--- a/lib/Target/CellSPU/SPUFrameLowering.h
+++ b/lib/Target/CellSPU/SPUFrameLowering.h
@@ -43,9 +43,6 @@ namespace llvm {
     void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                               RegScavenger *RS = NULL) const;
 
-    //! Perform target-specific stack frame setup.
-    void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-
     //! Return a function's saved spill slots
     /*!
       For CellSPU, a function's saved spill slots is just the link register.
@@ -77,17 +74,6 @@ namespace llvm {
     static int FItoStackOffset(int frame_index) {
       return frame_index * stackSlotSize();
     }
-    //! Number of instructions required to overcome hint-for-branch latency
-    /*!
-      HBR (hint-for-branch) instructions can be inserted when, for example,
-      we know that a given function is going to be called, such as printf(),
-      in the control flow graph. HBRs are only inserted if a sufficient number
-      of instructions occurs between the HBR and the target. Currently, HBRs
-      take 6 cycles, ergo, the magic number 6.
-     */
-    static int branchHintPenalty() {
-      return 6;
-    }
   };
 }
 
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index f0ceee2141497..ac33111f74ae7 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -69,7 +69,7 @@ namespace {
     TargetLowering::ArgListEntry Entry;
     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
       EVT ArgVT = Op.getOperand(i).getValueType();
-      const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
       Entry.Node = Op.getOperand(i);
       Entry.Ty = ArgTy;
       Entry.isSExt = isSigned;
@@ -80,7 +80,7 @@ namespace {
                                            TLI.getPointerTy());
 
     // Splice the libcall in wherever FindInputOutputChains tells us to.
-    const Type *RetTy =
+    Type *RetTy =
                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
     std::pair<SDValue, SDValue> CallInfo =
             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
@@ -174,6 +174,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
 
   // SPU has no intrinsics for these particular operations:
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
 
   // SPU has no division/remainder instructions
   setOperationAction(ISD::SREM,    MVT::i8,   Expand);
@@ -401,6 +402,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
     MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
 
+    // Set operation actions to legal types only.
+    if (!isTypeLegal(VT)) continue;
+
     // add/sub are legal for all supported vector VT's.
     setOperationAction(ISD::ADD,     VT, Legal);
     setOperationAction(ISD::SUB,     VT, Legal);
@@ -438,6 +442,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?
 
   setStackPointerRegisterToSaveRestore(SPU::R1);
 
@@ -497,7 +502,7 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
 // Return the Cell SPU's SETCC result type
 //===----------------------------------------------------------------------===//
 
-MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
+EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
   // i8, i16 and i32 are valid SETCC result types
   MVT::SimpleValueType retval;
 
@@ -2727,6 +2732,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
   // the type to extend from needs to be i64 or i32.
   assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
           "LowerSIGN_EXTEND: input and/or output operand have wrong size");
+  (void)OpVT;
 
   // Create shuffle mask
   unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
@@ -3216,7 +3222,7 @@ SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 /// isLegalAddressImmediate - Return true if the integer value can be used
 /// as the offset of the target addressing mode.
 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
-                                                const Type *Ty) const {
+                                                Type *Ty) const {
   // SPU's addresses are 256K:
   return (V > -(1 << 18) && V < (1 << 18) - 1);
 }
@@ -3239,7 +3245,7 @@ bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
 
 bool
 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                         const Type * ) const{
+                                         Type * ) const{
 
   // A-form: 18bit absolute address.
   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index d23f6cc48c715..aa4a1687278a0 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -107,7 +107,7 @@ namespace llvm {
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - Return the ValueType for ISD::SETCC
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(EVT VT) const;
 
     virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
@@ -147,7 +147,7 @@ namespace llvm {
 
     /// isLegalAddressImmediate - Return true if the integer value can be used
     /// as the offset of the target addressing mode.
-    virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+    virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
     virtual bool isLegalAddressImmediate(GlobalValue *) const;
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
@@ -179,7 +179,7 @@ namespace llvm {
     virtual bool isLegalICmpImmediate(int64_t Imm) const;
 
     virtual bool isLegalAddressingMode(const AddrMode &AM,
-                                       const Type *Ty) const;
+                                       Type *Ty) const;
   };
 }
 
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index e67b10c7984d6..007bc0e02c7ec 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -17,9 +17,9 @@
 #include "SPUHazardRecognizers.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define GET_INSTRINFO_CTOR
@@ -290,6 +290,8 @@ static void removeHBR( MachineBasicBlock &MBB) {
     if (I->getOpcode() == SPU::HBRA ||
         I->getOpcode() == SPU::HBR_LABEL){
       I=MBB.erase(I);
+      if (I == MBB.end())
+        break;
     }
   }
 }
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index e103c9b6a5af9..f76ebd75bfef7 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -1594,8 +1594,8 @@ multiclass BitwiseOrImm
 {
   def v4i32: ORIVecInst<v4i32, v4i32Uns10Imm>;
 
-  def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, u10imm_i32:$val),
-                   [(set R32C:$rT, (or R32C:$rA, i32ImmUns10:$val))]>;
+  def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+                   [(set R32C:$rT, (or R32C:$rA, i32ImmSExt10:$val))]>;
 
   // i16i32: hacked version of the ori instruction to extend 16-bit quantities
   // to 32-bit quantities. used exclusively to match "anyext" conversions (vide
@@ -3467,8 +3467,10 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
         [/* no pattern */]>;
 
     // Indirect branch
-    def BI:
-      BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+    let isIndirectBranch = 1 in {
+      def BI:
+        BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+    }
   }
 
   // Conditional branches:
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index 19896c0b4be9f..bbac6fd0be54d 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -25,7 +25,6 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -187,7 +186,7 @@ unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) {
 
 SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget,
                                  const TargetInstrInfo &tii) :
-  SPUGenRegisterInfo(), Subtarget(subtarget), TII(tii)
+  SPUGenRegisterInfo(SPU::R0), Subtarget(subtarget), TII(tii)
 {
 }
 
@@ -310,28 +309,12 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   }
 }
 
-unsigned
-SPURegisterInfo::getRARegister() const
-{
-  return SPU::R0;
-}
-
 unsigned
 SPURegisterInfo::getFrameRegister(const MachineFunction &MF) const
 {
   return SPU::R1;
 }
 
-int
-SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
-  return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int SPURegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
-  return SPUGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
-}
-
 int
 SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const
 {
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index 5e014f8adbfca..b7818a47abd7e 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -74,8 +74,6 @@ namespace llvm {
     void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                              RegScavenger *RS = NULL) const;
 
-    //! Get return address register (LR, aka R0)
-    unsigned getRARegister() const;
     //! Get the stack frame register (SP, aka R1)
     unsigned getFrameRegister(const MachineFunction &MF) const;
 
@@ -83,10 +81,6 @@ namespace llvm {
     // New methods added:
     //------------------------------------------------------------------------
 
-    //! Get DWARF debugging register number
-    int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
     //! Convert D-form load/store to X-form load/store
     /*!
       Converts a regiser displacement load/store into a register-indexed
diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp
index 856dc82f786be..43335abf0ac2b 100644
--- a/lib/Target/CellSPU/SPUSubtarget.cpp
+++ b/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -14,7 +14,7 @@
 #include "SPUSubtarget.h"
 #include "SPU.h"
 #include "SPURegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/SmallVector.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 3542a2b87e432..93a7f6e36501f 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -16,7 +16,8 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
@@ -31,9 +32,10 @@ SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
   return &LR[0];
 }
 
-SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
-                                   const std::string &CPU,const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   Reloc::Model RM, CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS),
     DataLayout(Subtarget.getTargetDataString()),
     InstrInfo(*this),
@@ -41,9 +43,6 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
     TLInfo(*this),
     TSInfo(*this),
     InstrItins(Subtarget.getInstrItineraryData()) {
-  // For the time being, use static relocations, since there's really no
-  // support for PIC yet.
-  setRelocationModel(Reloc::Static);
 }
 
 //===----------------------------------------------------------------------===//
@@ -59,8 +58,16 @@ bool SPUTargetMachine::addInstSelector(PassManagerBase &PM,
 
 // passes to run just before printing the assembly
 bool SPUTargetMachine::
-addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) 
-{
+addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
+  // load the TCE instruction scheduler, if available via
+  // loaded plugins
+  typedef llvm::FunctionPass* (*BuilderFunc)(const char*);
+  BuilderFunc schedulerCreator =
+    (BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol(
+          "createTCESchedulerPass");
+  if (schedulerCreator != NULL)
+      PM.add(schedulerCreator("cellspu"));
+
   //align instructions with nops/lnops for dual issue
   PM.add(createSPUNopFillerPass(*this));
   return true;
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index d96f86dcaeb0f..fffe77cabba37 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -38,8 +38,9 @@ class SPUTargetMachine : public LLVMTargetMachine {
   SPUSelectionDAGInfo TSInfo;
   InstrItineraryData  InstrItins;
 public:
-  SPUTargetMachine(const Target &T, const std::string &TT,
-                   const std::string &CPU, const std::string &FS);
+  SPUTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS,
+                   Reloc::Model RM, CodeModel::Model CM);
 
   /// Return the subtarget implementation object
   virtual const SPUSubtarget     *getSubtargetImpl() const {
diff --git a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
index 928d0fe97e0d2..3f2d6b09adad5 100644
--- a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMCellSPUInfo
   CellSPUTargetInfo.cpp
   )
 
-add_dependencies(LLVMCellSPUInfo CellSPUCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMCellSPUInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMCellSPUInfo CellSPUCommonTableGen)
diff --git a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
index 049ea236e9922..84aadfad6f8d6 100644
--- a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
+++ b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "SPU.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheCellSPUTarget;
diff --git a/lib/Target/CppBackend/CMakeLists.txt b/lib/Target/CppBackend/CMakeLists.txt
index e9375599511cd..95b6058243dce 100644
--- a/lib/Target/CppBackend/CMakeLists.txt
+++ b/lib/Target/CppBackend/CMakeLists.txt
@@ -2,4 +2,11 @@ add_llvm_target(CppBackend
   CPPBackend.cpp
   )
 
+add_llvm_library_dependencies(LLVMCppBackend
+  LLVMCore
+  LLVMCppBackendInfo
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 10d18f61c7e2b..ae0e3c404b76e 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -29,7 +29,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
 #include <algorithm>
@@ -77,22 +77,12 @@ extern "C" void LLVMInitializeCppBackendTarget() {
   RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
 }
 
-extern "C" void LLVMInitializeCppBackendMCAsmInfo() {}
-
-extern "C" void LLVMInitializeCppBackendMCInstrInfo() {
-  RegisterMCInstrInfo<MCInstrInfo> X(TheCppBackendTarget);
-}
-
-extern "C" void LLVMInitializeCppBackendMCSubtargetInfo() {
-  RegisterMCSubtargetInfo<MCSubtargetInfo> X(TheCppBackendTarget);
-}
-
 namespace {
-  typedef std::vector<const Type*> TypeList;
-  typedef std::map<const Type*,std::string> TypeMap;
+  typedef std::vector<Type*> TypeList;
+  typedef std::map<Type*,std::string> TypeMap;
   typedef std::map<const Value*,std::string> ValueMap;
   typedef std::set<std::string> NameSet;
-  typedef std::set<const Type*> TypeSet;
+  typedef std::set<Type*> TypeSet;
   typedef std::set<const Value*> ValueSet;
   typedef std::map<const Value*,std::string> ForwardRefMap;
 
@@ -143,14 +133,14 @@ namespace {
     void printEscapedString(const std::string& str);
     void printCFP(const ConstantFP* CFP);
 
-    std::string getCppName(const Type* val);
-    inline void printCppName(const Type* val);
+    std::string getCppName(Type* val);
+    inline void printCppName(Type* val);
 
     std::string getCppName(const Value* val);
     inline void printCppName(const Value* val);
 
     void printAttributes(const AttrListPtr &PAL, const std::string &name);
-    void printType(const Type* Ty);
+    void printType(Type* Ty);
     void printTypes(const Module* M);
 
     void printConstant(const Constant *CPV);
@@ -164,7 +154,7 @@ namespace {
     void printFunctionHead(const Function *F);
     void printFunctionBody(const Function *F);
     void printInstruction(const Instruction *I, const std::string& bbname);
-    std::string getOpName(Value*);
+    std::string getOpName(const Value*);
 
     void printModuleBody();
   };
@@ -184,7 +174,7 @@ static inline void sanitize(std::string &str) {
       str[i] = '_';
 }
 
-static std::string getTypePrefix(const Type *Ty) {
+static std::string getTypePrefix(Type *Ty) {
   switch (Ty->getTypeID()) {
   case Type::VoidTyID:     return "void_";
   case Type::IntegerTyID:
@@ -339,7 +329,7 @@ void CppWriter::printEscapedString(const std::string &Str) {
   }
 }
 
-std::string CppWriter::getCppName(const Type* Ty) {
+std::string CppWriter::getCppName(Type* Ty) {
   // First, handle the primitive types .. easy
   if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
     switch (Ty->getTypeID()) {
@@ -379,7 +369,7 @@ std::string CppWriter::getCppName(const Type* Ty) {
 
   // See if the type has a name in the symboltable and build accordingly
   std::string name;
-  if (const StructType *STy = dyn_cast<StructType>(Ty))
+  if (StructType *STy = dyn_cast<StructType>(Ty))
     if (STy->hasName())
       name = STy->getName();
   
@@ -393,7 +383,7 @@ std::string CppWriter::getCppName(const Type* Ty) {
   return TypeNames[Ty] = name;
 }
 
-void CppWriter::printCppName(const Type* Ty) {
+void CppWriter::printCppName(Type* Ty) {
   printEscapedString(getCppName(Ty));
 }
 
@@ -480,6 +470,9 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
       HANDLE_ATTR(NoImplicitFloat);
       HANDLE_ATTR(Naked);
       HANDLE_ATTR(InlineHint);
+      HANDLE_ATTR(ReturnsTwice);
+      HANDLE_ATTR(UWTable);
+      HANDLE_ATTR(NonLazyBind);
 #undef HANDLE_ATTR
       if (attrs & Attribute::StackAlignment)
         Out << " | Attribute::constructStackAlignmentFromInt("
@@ -499,7 +492,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
   }
 }
 
-void CppWriter::printType(const Type* Ty) {
+void CppWriter::printType(Type* Ty) {
   // We don't print definitions for primitive types
   if (Ty->isPrimitiveType() || Ty->isIntegerTy())
     return;
@@ -514,13 +507,13 @@ void CppWriter::printType(const Type* Ty) {
   // Print the type definition
   switch (Ty->getTypeID()) {
   case Type::FunctionTyID:  {
-    const FunctionType* FT = cast<FunctionType>(Ty);
+    FunctionType* FT = cast<FunctionType>(Ty);
     Out << "std::vector<Type*>" << typeName << "_args;";
     nl(Out);
     FunctionType::param_iterator PI = FT->param_begin();
     FunctionType::param_iterator PE = FT->param_end();
     for (; PI != PE; ++PI) {
-      const Type* argTy = static_cast<const Type*>(*PI);
+      Type* argTy = static_cast<Type*>(*PI);
       printType(argTy);
       std::string argName(getCppName(argTy));
       Out << typeName << "_args.push_back(" << argName;
@@ -539,13 +532,21 @@ void CppWriter::printType(const Type* Ty) {
     break;
   }
   case Type::StructTyID: {
-    const StructType* ST = cast<StructType>(Ty);
-    if (!ST->isAnonymous()) {
-      Out << "StructType *" << typeName << " = ";
-      Out << "StructType::createNamed(mod->getContext(), \"";
+    StructType* ST = cast<StructType>(Ty);
+    if (!ST->isLiteral()) {
+      Out << "StructType *" << typeName << " = mod->getTypeByName(\"";
+      printEscapedString(ST->getName());
+      Out << "\");";
+      nl(Out);
+      Out << "if (!" << typeName << ") {";
+      nl(Out);
+      Out << typeName << " = ";
+      Out << "StructType::create(mod->getContext(), \"";
       printEscapedString(ST->getName());
       Out << "\");";
       nl(Out);
+      Out << "}";
+      nl(Out);
       // Indicate that this type is now defined.
       DefinedTypes.insert(Ty);
     }
@@ -555,7 +556,7 @@ void CppWriter::printType(const Type* Ty) {
     StructType::element_iterator EI = ST->element_begin();
     StructType::element_iterator EE = ST->element_end();
     for (; EI != EE; ++EI) {
-      const Type* fieldTy = static_cast<const Type*>(*EI);
+      Type* fieldTy = static_cast<Type*>(*EI);
       printType(fieldTy);
       std::string fieldName(getCppName(fieldTy));
       Out << typeName << "_fields.push_back(" << fieldName;
@@ -563,21 +564,27 @@ void CppWriter::printType(const Type* Ty) {
       nl(Out);
     }
 
-    if (ST->isAnonymous()) {
+    if (ST->isLiteral()) {
       Out << "StructType *" << typeName << " = ";
       Out << "StructType::get(" << "mod->getContext(), ";
     } else {
+      Out << "if (" << typeName << "->isOpaque()) {";
+      nl(Out);
       Out << typeName << "->setBody(";
     }
 
     Out << typeName << "_fields, /*isPacked=*/"
         << (ST->isPacked() ? "true" : "false") << ");";
     nl(Out);
+    if (!ST->isLiteral()) {
+      Out << "}";
+      nl(Out);
+    }
     break;
   }
   case Type::ArrayTyID: {
-    const ArrayType* AT = cast<ArrayType>(Ty);
-    const Type* ET = AT->getElementType();
+    ArrayType* AT = cast<ArrayType>(Ty);
+    Type* ET = AT->getElementType();
     printType(ET);
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
@@ -589,8 +596,8 @@ void CppWriter::printType(const Type* Ty) {
     break;
   }
   case Type::PointerTyID: {
-    const PointerType* PT = cast<PointerType>(Ty);
-    const Type* ET = PT->getElementType();
+    PointerType* PT = cast<PointerType>(Ty);
+    Type* ET = PT->getElementType();
     printType(ET);
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
@@ -602,8 +609,8 @@ void CppWriter::printType(const Type* Ty) {
     break;
   }
   case Type::VectorTyID: {
-    const VectorType* PT = cast<VectorType>(Ty);
-    const Type* ET = PT->getElementType();
+    VectorType* PT = cast<VectorType>(Ty);
+    Type* ET = PT->getElementType();
     printType(ET);
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
@@ -766,9 +773,7 @@ void CppWriter::printConstant(const Constant *CV) {
       Out << "Constant* " << constName
           << " = ConstantExpr::getGetElementPtr("
           << getCppName(CE->getOperand(0)) << ", "
-          << "&" << constName << "_indices[0], "
-          << constName << "_indices.size()"
-          << ");";
+          << constName << "_indices);";
     } else if (CE->isCast()) {
       printConstant(CE->getOperand(0));
       Out << "Constant* " << constName << " = ConstantExpr::getCast(";
@@ -988,7 +993,7 @@ void CppWriter::printVariableBody(const GlobalVariable *GV) {
   }
 }
 
-std::string CppWriter::getOpName(Value* V) {
+std::string CppWriter::getOpName(const Value* V) {
   if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
     return getCppName(V);
 
@@ -1053,14 +1058,17 @@ void CppWriter::printInstruction(const Instruction *I,
   case Instruction::Switch: {
     const SwitchInst *SI = cast<SwitchInst>(I);
     Out << "SwitchInst* " << iName << " = SwitchInst::Create("
-        << opNames[0] << ", "
-        << opNames[1] << ", "
+        << getOpName(SI->getCondition()) << ", "
+        << getOpName(SI->getDefaultDest()) << ", "
         << SI->getNumCases() << ", " << bbname << ");";
     nl(Out);
-    for (unsigned i = 2; i != SI->getNumOperands(); i += 2) {
+    unsigned NumCases = SI->getNumCases();
+    for (unsigned i = 1; i < NumCases; ++i) {
+      const ConstantInt* CaseVal = SI->getCaseValue(i);
+      const BasicBlock* BB = SI->getSuccessor(i);
       Out << iName << "->addCase("
-          << opNames[i] << ", "
-          << opNames[i+1] << ");";
+          << getOpName(CaseVal) << ", "
+          << getOpName(BB) << ");";
       nl(Out);
     }
     break;
@@ -1076,6 +1084,11 @@ void CppWriter::printInstruction(const Instruction *I,
     }
     break;
   }
+  case Instruction::Resume: {
+    Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
+        << ", " << bbname << ");";
+    break;
+  }
   case Instruction::Invoke: {
     const InvokeInst* inv = cast<InvokeInst>(I);
     Out << "std::vector<Value*> " << iName << "_params;";
@@ -1090,8 +1103,7 @@ void CppWriter::printInstruction(const Instruction *I,
         << getOpName(inv->getCalledFunction()) << ", "
         << getOpName(inv->getNormalDest()) << ", "
         << getOpName(inv->getUnwindDest()) << ", "
-        << iName << "_params.begin(), "
-        << iName << "_params.end(), \"";
+        << iName << "_params, \"";
     printEscapedString(inv->getName());
     Out << "\", " << bbname << ");";
     nl(Out) << iName << "->setCallingConv(";
@@ -1252,8 +1264,7 @@ void CppWriter::printInstruction(const Instruction *I,
         nl(Out);
       }
       Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
-          << opNames[0] << ", " << iName << "_indices.begin(), "
-          << iName << "_indices.end()";
+          << opNames[0] << ", " << iName << "_indices";
     }
     Out << ", \"";
     printEscapedString(gep->getName());
@@ -1304,7 +1315,7 @@ void CppWriter::printInstruction(const Instruction *I,
     case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
     case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
     case Instruction::BitCast:  Out << "BitCastInst"; break;
-    default: assert(!"Unreachable"); break;
+    default: assert(0 && "Unreachable"); break;
     }
     Out << "(" << opNames[0] << ", "
         << getCppName(cst->getType()) << ", \"";
@@ -1331,8 +1342,7 @@ void CppWriter::printInstruction(const Instruction *I,
       }
       Out << "CallInst* " << iName << " = CallInst::Create("
           << opNames[call->getNumArgOperands()] << ", "
-          << iName << "_params.begin(), "
-          << iName << "_params.end(), \"";
+          << iName << "_params, \"";
     } else if (call->getNumArgOperands() == 1) {
       Out << "CallInst* " << iName << " = CallInst::Create("
           << opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \"";
@@ -1415,7 +1425,7 @@ void CppWriter::printInstruction(const Instruction *I,
     Out << "ExtractValueInst* " << getCppName(evi)
         << " = ExtractValueInst::Create(" << opNames[0]
         << ", "
-        << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+        << iName << "_indices, \"";
     printEscapedString(evi->getName());
     Out << "\", " << bbname << ");";
     break;
@@ -1432,7 +1442,7 @@ void CppWriter::printInstruction(const Instruction *I,
     Out << "InsertValueInst* " << getCppName(ivi)
         << " = InsertValueInst::Create(" << opNames[0]
         << ", " << opNames[1] << ", "
-        << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+        << iName << "_indices, \"";
     printEscapedString(ivi->getName());
     Out << "\", " << bbname << ");";
     break;
@@ -1542,13 +1552,12 @@ void CppWriter::printFunctionUses(const Function* F) {
 
 void CppWriter::printFunctionHead(const Function* F) {
   nl(Out) << "Function* " << getCppName(F);
-  if (is_inline) {
-    Out << " = mod->getFunction(\"";
-    printEscapedString(F->getName());
-    Out << "\", " << getCppName(F->getFunctionType()) << ");";
-    nl(Out) << "if (!" << getCppName(F) << ") {";
-    nl(Out) << getCppName(F);
-  }
+  Out << " = mod->getFunction(\"";
+  printEscapedString(F->getName());
+  Out << "\");";
+  nl(Out) << "if (!" << getCppName(F) << ") {";
+  nl(Out) << getCppName(F);
+
   Out<< " = Function::Create(";
   nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
   nl(Out) << "/*Linkage=*/";
@@ -1585,10 +1594,8 @@ void CppWriter::printFunctionHead(const Function* F) {
     Out << "->setGC(\"" << F->getGC() << "\");";
     nl(Out);
   }
-  if (is_inline) {
-    Out << "}";
-    nl(Out);
-  }
+  Out << "}";
+  nl(Out);
   printAttributes(F->getAttributes(), getCppName(F));
   printCppName(F);
   Out << "->setAttributes(" << getCppName(F) << "_PAL);";
@@ -1873,7 +1880,7 @@ void CppWriter::printVariable(const std::string& fname,
 
 void CppWriter::printType(const std::string &fname,
                           const std::string &typeName) {
-  const Type* Ty = TheModule->getTypeByName(typeName);
+  Type* Ty = TheModule->getTypeByName(typeName);
   if (!Ty) {
     error(std::string("Type '") + typeName + "' not found in input module");
     return;
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 7322e3e34f00c..287e53727139f 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -22,8 +22,9 @@ namespace llvm {
 class formatted_raw_ostream;
 
 struct CPPTargetMachine : public TargetMachine {
-  CPPTargetMachine(const Target &T, const std::string &TT,
-                   const std::string &CPU, const std::string &FS)
+  CPPTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS,
+                   Reloc::Model RM, CodeModel::Model CM)
     : TargetMachine(T, TT, CPU, FS) {}
 
   virtual bool addPassesToEmitFile(PassManagerBase &PM,
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
index edaf5d3cb1886..7165d8fdf2cd9 100644
--- a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
@@ -4,3 +4,7 @@ add_llvm_library(LLVMCppBackendInfo
   CppBackendTargetInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMCppBackendInfo
+  LLVMMC
+  LLVMTarget
+  )
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
index d0aeb12499c53..a8ac0a282cd1d 100644
--- a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
+++ b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "CPPTargetMachine.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheCppBackendTarget;
@@ -24,3 +24,5 @@ extern "C" void LLVMInitializeCppBackendTargetInfo() {
                                   "C++ backend",
                                   &CppBackend_TripleMatchQuality);
 }
+
+extern "C" void LLVMInitializeCppBackendTargetMC() {}
diff --git a/lib/Target/MBlaze/AsmParser/CMakeLists.txt b/lib/Target/MBlaze/AsmParser/CMakeLists.txt
index 87e7cb5da561e..ec8f52a92cb1e 100644
--- a/lib/Target/MBlaze/AsmParser/CMakeLists.txt
+++ b/lib/Target/MBlaze/AsmParser/CMakeLists.txt
@@ -6,3 +6,11 @@ add_llvm_library(LLVMMBlazeAsmParser
   MBlazeAsmParser.cpp
   )
 
+add_llvm_library_dependencies(LLVMMBlazeAsmParser
+  LLVMMBlazeInfo
+  LLVMMC
+  LLVMMCParser
+  LLVMSupport
+  )
+
+add_dependencies(LLVMMBlazeAsmParser MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
index 15965964452a3..2d357bb9674dd 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
@@ -7,8 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MBlaze.h"
-#include "MBlazeTargetMachine.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
 
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
@@ -17,10 +16,10 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
 
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #include <string>
 #include <map>
@@ -29,7 +28,7 @@ using namespace llvm;
 
 namespace {
   
-  class MBlazeBaseAsmLexer : public TargetAsmLexer {
+  class MBlazeBaseAsmLexer : public MCTargetAsmLexer {
     const MCAsmInfo &AsmInfo;
     
     const AsmToken &lexDefinite() {
@@ -42,7 +41,7 @@ namespace {
     
     rmap_ty RegisterMap;
     
-    void InitRegisterMap(const TargetRegisterInfo *info) {
+    void InitRegisterMap(const MCRegisterInfo *info) {
       unsigned numRegs = info->getNumRegs();
 
       for (unsigned i = 0; i < numRegs; ++i) {
@@ -76,20 +75,16 @@ namespace {
     }
   public:
     MBlazeBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
-      : TargetAsmLexer(T), AsmInfo(MAI) {
+      : MCTargetAsmLexer(T), AsmInfo(MAI) {
     }
   };
   
   class MBlazeAsmLexer : public MBlazeBaseAsmLexer {
   public:
-    MBlazeAsmLexer(const Target &T, const MCAsmInfo &MAI)
+    MBlazeAsmLexer(const Target &T, const MCRegisterInfo &MRI,
+                   const MCAsmInfo &MAI)
       : MBlazeBaseAsmLexer(T, MAI) {
-      std::string tripleString("mblaze-unknown-unknown");
-      std::string featureString;
-      std::string CPU;
-      OwningPtr<const TargetMachine> 
-        targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
-      InitRegisterMap(targetMachine->getRegisterInfo());
+      InitRegisterMap(&MRI);
     }
   };
 }
@@ -123,6 +118,6 @@ AsmToken MBlazeBaseAsmLexer::LexTokenUAL() {
 }
 
 extern "C" void LLVMInitializeMBlazeAsmLexer() {
-  RegisterAsmLexer<MBlazeAsmLexer> X(TheMBlazeTarget);
+  RegisterMCAsmLexer<MBlazeAsmLexer> X(TheMBlazeTarget);
 }
 
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index eebd9d878943a..97d311c15107f 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -7,19 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MBlaze.h"
-#include "MBlazeSubtarget.h"
-#include "MBlazeRegisterInfo.h"
-#include "MBlazeISelLowering.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
@@ -30,7 +27,7 @@ using namespace llvm;
 namespace {
 struct MBlazeOperand;
 
-class MBlazeAsmParser : public TargetAsmParser {
+class MBlazeAsmParser : public MCTargetAsmParser {
   MCAsmParser &Parser;
 
   MCAsmParser &getParser() const { return Parser; }
@@ -64,7 +61,7 @@ class MBlazeAsmParser : public TargetAsmParser {
 
 public:
   MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : TargetAsmParser(), Parser(_Parser) {}
+    : MCTargetAsmParser(), Parser(_Parser) {}
 
   virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands);
@@ -286,19 +283,19 @@ void MBlazeOperand::print(raw_ostream &OS) const {
     break;
   case Register:
     OS << "<register R";
-    OS << MBlazeRegisterInfo::getRegisterNumbering(getReg()) << ">";
+    OS << getMBlazeRegisterNumbering(getReg()) << ">";
     break;
   case Token:
     OS << "'" << getToken() << "'";
     break;
   case Memory: {
     OS << "<memory R";
-    OS << MBlazeRegisterInfo::getRegisterNumbering(getMemBase());
+    OS << getMBlazeRegisterNumbering(getMemBase());
     OS << ", ";
 
     unsigned RegOff = getMemOffReg();
     if (RegOff)
-      OS << "R" << MBlazeRegisterInfo::getRegisterNumbering(RegOff);
+      OS << "R" << getMBlazeRegisterNumbering(RegOff);
     else
       OS << getMemOff();
     OS << ">";
@@ -326,6 +323,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
   unsigned ErrorInfo;
 
   switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) {
+  default: break;
   case Match_Success:
     Out.EmitInstruction(Inst);
     return false;
@@ -521,7 +519,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
   return false;
 }
 
-/// ParseDirective parses the arm specific directives
+/// ParseDirective parses the MBlaze specific directives
 bool MBlazeAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
@@ -558,7 +556,7 @@ extern "C" void LLVMInitializeMBlazeAsmLexer();
 
 /// Force static initialization.
 extern "C" void LLVMInitializeMBlazeAsmParser() {
-  RegisterAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
+  RegisterMCAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
   LLVMInitializeMBlazeAsmLexer();
 }
 
diff --git a/lib/Target/MBlaze/AsmParser/Makefile b/lib/Target/MBlaze/AsmParser/Makefile
index 611a0f473f73f..1e68766a084f7 100644
--- a/lib/Target/MBlaze/AsmParser/Makefile
+++ b/lib/Target/MBlaze/AsmParser/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
+##===- lib/Target/MBlaze/AsmParser/Makefile ----------------*- Makefile -*-===##
 #
 #                     The LLVM Compiler Infrastructure
 #
diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt
index 0bc5b7820378d..47b0db2cb2d02 100644
--- a/lib/Target/MBlaze/CMakeLists.txt
+++ b/lib/Target/MBlaze/CMakeLists.txt
@@ -1,15 +1,16 @@
 set(LLVM_TARGET_DEFINITIONS MBlaze.td)
 
-tablegen(MBlazeGenRegisterInfo.inc -gen-register-info)
-tablegen(MBlazeGenInstrInfo.inc -gen-instr-info)
-tablegen(MBlazeGenCodeEmitter.inc -gen-emitter)
-tablegen(MBlazeGenAsmWriter.inc -gen-asm-writer)
-tablegen(MBlazeGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(MBlazeGenDAGISel.inc -gen-dag-isel)
-tablegen(MBlazeGenCallingConv.inc -gen-callingconv)
-tablegen(MBlazeGenSubtargetInfo.inc -gen-subtarget)
-tablegen(MBlazeGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(MBlazeGenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(MBlazeGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MBlazeGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MBlazeGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(MBlazeGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MBlazeGenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(MBlazeGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MBlazeGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MBlazeGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MBlazeGenIntrinsics.inc -gen-tgt-intrinsic)
+llvm_tablegen(MBlazeGenEDInfo.inc -gen-enhanced-disassembly-info)
+add_public_tablegen_target(MBlazeCommonTableGen)
 
 add_llvm_target(MBlazeCodeGen
   MBlazeDelaySlotFiller.cpp
@@ -24,10 +25,21 @@ add_llvm_target(MBlazeCodeGen
   MBlazeIntrinsicInfo.cpp
   MBlazeSelectionDAGInfo.cpp
   MBlazeAsmPrinter.cpp
-  MBlazeAsmBackend.cpp
   MBlazeMCInstLower.cpp
   MBlazeELFWriterInfo.cpp
-  MBlazeMCCodeEmitter.cpp
+  )
+
+add_llvm_library_dependencies(LLVMMBlazeCodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMBlazeAsmPrinter
+  LLVMMBlazeDesc
+  LLVMMBlazeInfo
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
   )
 
 add_subdirectory(AsmParser)
diff --git a/lib/Target/MBlaze/Disassembler/CMakeLists.txt b/lib/Target/MBlaze/Disassembler/CMakeLists.txt
index 9376e68a35cf7..112c64c026381 100644
--- a/lib/Target/MBlaze/Disassembler/CMakeLists.txt
+++ b/lib/Target/MBlaze/Disassembler/CMakeLists.txt
@@ -13,4 +13,12 @@ set_property(
   )
 endif()
 
-add_dependencies(LLVMMBlazeDisassembler MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeDisassembler
+  LLVMMBlazeCodeGen
+  LLVMMBlazeDesc
+  LLVMMBlazeInfo
+  LLVMMC
+  LLVMSupport
+  )
+
+add_dependencies(LLVMMBlazeDisassembler MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index 88d80a12eb3a2..fd761f1ca8c15 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -20,9 +20,9 @@
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 // #include "MBlazeGenDecoderTables.inc"
@@ -60,27 +60,27 @@ static unsigned mblazeBinary2Opcode[] = {
 };
 
 static unsigned getRD(uint32_t insn) {
-  if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F))
+  if (!isMBlazeRegister((insn>>21)&0x1F))
     return UNSUPPORTED;
-  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F);
+  return getMBlazeRegisterFromNumbering((insn>>21)&0x1F);
 }
 
 static unsigned getRA(uint32_t insn) {
-  if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F))
+  if (!getMBlazeRegisterFromNumbering((insn>>16)&0x1F))
     return UNSUPPORTED;
-  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F);
+  return getMBlazeRegisterFromNumbering((insn>>16)&0x1F);
 }
 
 static unsigned getRB(uint32_t insn) {
-  if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F))
+  if (!getMBlazeRegisterFromNumbering((insn>>11)&0x1F))
     return UNSUPPORTED;
-  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F);
+  return getMBlazeRegisterFromNumbering((insn>>11)&0x1F);
 }
 
 static int64_t getRS(uint32_t insn) {
-  if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF))
+  if (!isSpecialMBlazeRegister(insn&0x3FFF))
     return UNSUPPORTED;
-  return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF);
+  return getSpecialMBlazeRegisterFromNumbering(insn&0x3FFF);
 }
 
 static int64_t getIMM(uint32_t insn) {
@@ -493,11 +493,12 @@ EDInstInfo *MBlazeDisassembler::getEDInfo() const {
 // Public interface for the disassembler
 //
 
-bool MBlazeDisassembler::getInstruction(MCInst &instr,
+MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr,
                                         uint64_t &size,
                                         const MemoryObject &region,
                                         uint64_t address,
-                                        raw_ostream &vStream) const {
+                                        raw_ostream &vStream,
+                                        raw_ostream &cStream) const {
   // The machine instruction.
   uint32_t insn;
   uint64_t read;
@@ -508,7 +509,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   // We want to read exactly 4 bytes of data.
   if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
-    return false;
+    return Fail;
 
   // Encoded as a big-endian 32-bit word in the stream.
   insn = (bytes[0]<<24) | (bytes[1]<<16) | (bytes[2]<< 8) | (bytes[3]<<0);
@@ -517,7 +518,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
   // that it is a valid instruction.
   unsigned opcode = getOPCODE(insn);
   if (opcode == UNSUPPORTED)
-    return false;
+    return Fail;
 
   instr.setOpcode(opcode);
 
@@ -529,11 +530,11 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
   uint64_t tsFlags = MBlazeInsts[opcode].TSFlags;
   switch ((tsFlags & MBlazeII::FormMask)) {
   default: 
-    return false;
+    return Fail;
 
   case MBlazeII::FRRRR:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RB));
     instr.addOperand(MCOperand::CreateReg(RA));
@@ -541,7 +542,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FRRR:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateReg(RB));
@@ -550,23 +551,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
   case MBlazeII::FRI:
     switch (opcode) {
     default: 
-      return false;
+      return Fail;
     case MBlaze::MFS:
       if (RD == UNSUPPORTED)
-        return false;
+        return Fail;
       instr.addOperand(MCOperand::CreateReg(RD));
       instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
       break;
     case MBlaze::MTS:
       if (RA == UNSUPPORTED)
-        return false;
+        return Fail;
       instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
       instr.addOperand(MCOperand::CreateReg(RA));
       break;
     case MBlaze::MSRSET:
     case MBlaze::MSRCLR:
       if (RD == UNSUPPORTED)
-        return false;
+        return Fail;
       instr.addOperand(MCOperand::CreateReg(RD));
       instr.addOperand(MCOperand::CreateImm(insn&0x7FFF));
       break;
@@ -575,7 +576,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FRRI:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RA));
     switch (opcode) {
@@ -592,35 +593,35 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FCRR:
     if (RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FCRI:
     if (RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
     break;
 
   case MBlazeII::FRCR:
     if (RD == UNSUPPORTED || RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FRCI:
     if (RD == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
     break;
 
   case MBlazeII::FCCR:
     if (RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
@@ -630,7 +631,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FRRCI:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getSHT(insn)));
@@ -638,35 +639,35 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FRRC:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RA));
     break;
 
   case MBlazeII::FRCX:
     if (RD == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
     break;
 
   case MBlazeII::FRCS:
     if (RD == UNSUPPORTED || RS == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateReg(RS));
     break;
 
   case MBlazeII::FCRCS:
     if (RS == UNSUPPORTED || RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RS));
     instr.addOperand(MCOperand::CreateReg(RA));
     break;
 
   case MBlazeII::FCRCX:
     if (RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
     break;
@@ -677,13 +678,13 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   case MBlazeII::FCR:
     if (RB == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FRIR:
     if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return false;
+      return Fail;
     instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
     instr.addOperand(MCOperand::CreateReg(RA));
@@ -693,11 +694,12 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
   // We always consume 4 bytes of data on success
   size = 4;
 
-  return true;
+  return Success;
 }
 
-static MCDisassembler *createMBlazeDisassembler(const Target &T) {
-  return new MBlazeDisassembler;
+static MCDisassembler *createMBlazeDisassembler(const Target &T,
+                                                const MCSubtargetInfo &STI) {
+  return new MBlazeDisassembler(STI);
 }
 
 extern "C" void LLVMInitializeMBlazeDisassembler() {
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
index d05eced0bacf4..0ac0d89efbe78 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
@@ -32,19 +32,20 @@ class MBlazeDisassembler : public MCDisassembler {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  MBlazeDisassembler() :
-    MCDisassembler() {
+  MBlazeDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
   }
 
   ~MBlazeDisassembler() {
   }
 
   /// getInstruction - See MCDisassembler.
-  bool getInstruction(MCInst &instr,
+  MCDisassembler::DecodeStatus getInstruction(MCInst &instr,
                       uint64_t &size,
                       const MemoryObject &region,
                       uint64_t address,
-                      raw_ostream &vStream) const;
+                      raw_ostream &vStream,
+                      raw_ostream &cStream) const;
 
   /// getEDInfo - See MCDisassembler.
   EDInstInfo *getEDInfo() const;
diff --git a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
index 242a573036e62..aff0b3d992d4c 100644
--- a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
+++ b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
@@ -2,7 +2,12 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
                      ${CMAKE_CURRENT_SOURCE_DIR}/.. )
 
 add_llvm_library(LLVMMBlazeAsmPrinter
-    MBlazeInstPrinter.cpp
+  MBlazeInstPrinter.cpp
   )
 
-add_dependencies(LLVMMBlazeAsmPrinter MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
+
+add_dependencies(LLVMMBlazeAsmPrinter MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
index a7fd287990b7f..a1f1dbc7a23b7 100644
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
+++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
@@ -25,8 +25,10 @@ using namespace llvm;
 // Include the auto-generated portion of the assembly writer.
 #include "MBlazeGenAsmWriter.inc"
 
-void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot) {
   printInstruction(MI, O);
+  printAnnotation(O, Annot);
 }
 
 void MBlazeInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
index eacca410b9867..570ab08a07aad 100644
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
+++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
@@ -24,7 +24,7 @@ namespace llvm {
     MBlazeInstPrinter(const MCAsmInfo &MAI)
       : MCInstPrinter(MAI) {}
 
-    virtual void printInst(const MCInst *MI, raw_ostream &O);
+    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
     // Autogenerated by tblgen.
     void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/MBlaze/MBlaze.h b/lib/Target/MBlaze/MBlaze.h
index 3390794c93757..1399b85d34d2f 100644
--- a/lib/Target/MBlaze/MBlaze.h
+++ b/lib/Target/MBlaze/MBlaze.h
@@ -15,6 +15,7 @@
 #ifndef TARGET_MBLAZE_H
 #define TARGET_MBLAZE_H
 
+#include "MCTargetDesc/MBlazeBaseInfo.h"
 #include "MCTargetDesc/MBlazeMCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -22,17 +23,6 @@ namespace llvm {
   class MBlazeTargetMachine;
   class FunctionPass;
   class MachineCodeEmitter;
-  class MCCodeEmitter;
-  class MCInstrInfo;
-  class MCSubtargetInfo;
-  class TargetAsmBackend;
-  class formatted_raw_ostream;
-
-  MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
-                                           const MCSubtargetInfo &STI,
-                                           MCContext &Ctx);
-  
-  TargetAsmBackend *createMBlazeAsmBackend(const Target &, const std::string &);
 
   FunctionPass *createMBlazeISelDag(MBlazeTargetMachine &TM);
   FunctionPass *createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &TM);
diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MBlazeAsmBackend.cpp
deleted file mode 100644
index 08f14c3659571..0000000000000
--- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===-- MBlazeAsmBackend.cpp - MBlaze Assembler Backend -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmBackend.h"
-#include "MBlaze.h"
-#include "MBlazeELFWriterInfo.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
-using namespace llvm;
-
-static unsigned getFixupKindSize(unsigned Kind) {
-  switch (Kind) {
-  default: assert(0 && "invalid fixup kind!");
-  case FK_Data_1: return 1;
-  case FK_PCRel_2:
-  case FK_Data_2: return 2;
-  case FK_PCRel_4:
-  case FK_Data_4: return 4;
-  case FK_Data_8: return 8;
-  }
-}
-
-
-namespace {
-class MBlazeELFObjectWriter : public MCELFObjectTargetWriter {
-public:
-  MBlazeELFObjectWriter(Triple::OSType OSType)
-    : MCELFObjectTargetWriter(/*is64Bit*/ false, OSType, ELF::EM_MBLAZE,
-                              /*HasRelocationAddend*/ true) {}
-};
-
-class MBlazeAsmBackend : public TargetAsmBackend {
-public:
-  MBlazeAsmBackend(const Target &T)
-    : TargetAsmBackend() {
-  }
-
-  unsigned getNumFixupKinds() const {
-    return 2;
-  }
-
-  bool MayNeedRelaxation(const MCInst &Inst) const;
-
-  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
-
-  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
-
-  unsigned getPointerSize() const {
-    return 4;
-  }
-};
-
-static unsigned getRelaxedOpcode(unsigned Op) {
-    switch (Op) {
-    default:            return Op;
-    case MBlaze::ADDIK: return MBlaze::ADDIK32;
-    case MBlaze::ORI:   return MBlaze::ORI32;
-    case MBlaze::BRLID: return MBlaze::BRLID32;
-    }
-}
-
-bool MBlazeAsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
-  if (getRelaxedOpcode(Inst.getOpcode()) == Inst.getOpcode())
-    return false;
-
-  bool hasExprOrImm = false;
-  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
-    hasExprOrImm |= Inst.getOperand(i).isExpr();
-
-  return hasExprOrImm;
-}
-
-void MBlazeAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
-  Res = Inst;
-  Res.setOpcode(getRelaxedOpcode(Inst.getOpcode()));
-}
-
-bool MBlazeAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
-  if ((Count % 4) != 0)
-    return false;
-
-  for (uint64_t i = 0; i < Count; i += 4)
-      OW->Write32(0x00000000);
-
-  return true;
-}
-} // end anonymous namespace
-
-namespace {
-class ELFMBlazeAsmBackend : public MBlazeAsmBackend {
-public:
-  Triple::OSType OSType;
-  ELFMBlazeAsmBackend(const Target &T, Triple::OSType _OSType)
-    : MBlazeAsmBackend(T), OSType(_OSType) { }
-
-  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value) const;
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createELFObjectWriter(new MBlazeELFObjectWriter(OSType), OS,
-                                 /*IsLittleEndian*/ false);
-  }
-};
-
-void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
-                                     unsigned DataSize, uint64_t Value) const {
-  unsigned Size = getFixupKindSize(Fixup.getKind());
-
-  assert(Fixup.getOffset() + Size <= DataSize &&
-         "Invalid fixup offset!");
-
-  char *data = Data + Fixup.getOffset();
-  switch (Size) {
-  default: llvm_unreachable("Cannot fixup unknown value.");
-  case 1:  llvm_unreachable("Cannot fixup 1 byte value.");
-  case 8:  llvm_unreachable("Cannot fixup 8 byte value.");
-
-  case 4:
-    *(data+7) = uint8_t(Value);
-    *(data+6) = uint8_t(Value >> 8);
-    *(data+3) = uint8_t(Value >> 16);
-    *(data+2) = uint8_t(Value >> 24);
-    break;
-
-  case 2:
-    *(data+3) = uint8_t(Value >> 0);
-    *(data+2) = uint8_t(Value >> 8);
-  }
-}
-} // end anonymous namespace
-
-TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T,
-                                            const std::string &TT) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin())
-    assert(0 && "Mac not supported on MBlaze");
-
-  if (TheTriple.isOSWindows())
-    assert(0 && "Windows not supported on MBlaze");
-
-  return new ELFMBlazeAsmBackend(T, TheTriple.getOS());
-}
diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
index 0016df569b93e..97bd083fdd158 100644
--- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
+++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
@@ -38,10 +38,10 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
 
@@ -136,19 +136,17 @@ void MBlazeAsmPrinter::printSavedRegsBitmask() {
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    unsigned RegNum = MBlazeRegisterInfo::getRegisterNumbering(Reg);
+    unsigned RegNum = getMBlazeRegisterNumbering(Reg);
     if (MBlaze::GPRRegisterClass->contains(Reg))
       CPUBitmask |= (1 << RegNum);
   }
 
   // Return Address and Frame registers must also be set in CPUBitmask.
   if (TFI->hasFP(*MF))
-    CPUBitmask |= (1 << MBlazeRegisterInfo::
-                getRegisterNumbering(RI.getFrameRegister(*MF)));
+    CPUBitmask |= (1 <<  getMBlazeRegisterNumbering(RI.getFrameRegister(*MF)));
 
   if (MFI->adjustsStack())
-    CPUBitmask |= (1 << MBlazeRegisterInfo::
-                getRegisterNumbering(RI.getRARegister()));
+    CPUBitmask |= (1 << getMBlazeRegisterNumbering(RI.getRARegister()));
 
   // Print CPUBitmask
   OutStreamer.EmitRawText("\t.mask\t0x" + Twine::utohexstr(CPUBitmask));
@@ -318,18 +316,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   return I == Pred->end() || !I->getDesc().isBarrier();
 }
 
-static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
-                                                unsigned SyntaxVariant,
-                                                const MCAsmInfo &MAI) {
-  if (SyntaxVariant == 0)
-    return new MBlazeInstPrinter(MAI);
-  return 0;
-}
-
 // Force static initialization.
 extern "C" void LLVMInitializeMBlazeAsmPrinter() {
   RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget);
-  TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
-                                        createMBlazeMCInstPrinter);
-
 }
diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
index e7639025cf1a9..f28d5a77d49ca 100644
--- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
@@ -1,4 +1,4 @@
-//=======- MBlazeFrameLowering.cpp - MBlaze Frame Information ------*- C++ -*-====//
+//===- MBlazeFrameLowering.cpp - MBlaze Frame Information ------*- C++ -*-====//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 62dfdcc2fd103..8ec548f1437db 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -59,6 +59,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
   // MBlaze does not have i1 type, so use i32 for
   // setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   // Set up the register classes
   addRegisterClass(MVT::i32, MBlaze::GPRRegisterClass);
@@ -187,7 +188,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
   computeRegisterProperties();
 }
 
-MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
+EVT MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
@@ -964,13 +965,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     // The last register argument that must be saved is MBlaze::R10
     TargetRegisterClass *RC = MBlaze::GPRRegisterClass;
 
-    unsigned Begin = MBlazeRegisterInfo::getRegisterNumbering(MBlaze::R5);
-    unsigned Start = MBlazeRegisterInfo::getRegisterNumbering(ArgRegEnd+1);
-    unsigned End   = MBlazeRegisterInfo::getRegisterNumbering(MBlaze::R10);
+    unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5);
+    unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1);
+    unsigned End   = getMBlazeRegisterNumbering(MBlaze::R10);
     unsigned StackLoc = Start - Begin + 1;
 
     for (; Start <= End; ++Start, ++StackLoc) {
-      unsigned Reg = MBlazeRegisterInfo::getRegisterFromNumbering(Start);
+      unsigned Reg = getMBlazeRegisterFromNumbering(Start);
       unsigned LiveReg = MF.addLiveIn(Reg, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32);
 
@@ -1096,7 +1097,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight(
     // but allow it at the lowest weight.
   if (CallOperandVal == NULL)
     return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
   // Look at the constraint type.
   switch (*constraint) {
   default:
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index bb128da3c7c0d..8b49bc3de0cce 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -102,7 +102,7 @@ namespace llvm {
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - get the ISD::SETCC result ValueType
-    MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    EVT getSetCCResultType(EVT VT) const;
 
   private:
     // Subtarget Info
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
index 188f10a3972ea..7ae05b367cbad 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -17,9 +17,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 
 #define GET_INSTRINFO_CTOR
@@ -239,7 +239,8 @@ unsigned MBlazeInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   return 2;
 }
 
-bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand>
+                                               &Cond) const {
   assert(Cond.size() == 2 && "Invalid MBlaze branch opcode!");
   switch (Cond[0].getImm()) {
   default:            return true;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h
index 79f962b349bf9..7174405a49d94 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.h
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.h
@@ -166,62 +166,6 @@ namespace MBlaze {
   }
 }
 
-/// MBlazeII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace MBlazeII {
-  enum {
-    // PseudoFrm - This represents an instruction that is a pseudo instruction
-    // or one that has not been implemented yet.  It is illegal to code generate
-    // it, but tolerated for intermediate implementation stages.
-    FPseudo = 0,
-    FRRR,
-    FRRI,
-    FCRR,
-    FCRI,
-    FRCR,
-    FRCI,
-    FCCR,
-    FCCI,
-    FRRCI,
-    FRRC,
-    FRCX,
-    FRCS,
-    FCRCS,
-    FCRCX,
-    FCX,
-    FCR,
-    FRIR,
-    FRRRR,
-    FRI,
-    FC,
-    FormMask = 63
-
-    //===------------------------------------------------------------------===//
-    // MBlaze Specific MachineOperand flags.
-    // MO_NO_FLAG,
-
-    /// MO_GOT - Represents the offset into the global offset table at which
-    /// the address the relocation entry symbol resides during execution.
-    // MO_GOT,
-
-    /// MO_GOT_CALL - Represents the offset into the global offset table at
-    /// which the address of a call site relocation entry symbol resides
-    /// during execution. This is different from the above since this flag
-    /// can only be present in call instructions.
-    // MO_GOT_CALL,
-
-    /// MO_GPREL - Represents the offset from the current gp value to be used
-    /// for the relocatable object file being produced.
-    // MO_GPREL,
-
-    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
-    /// address.
-    // MO_ABS_HILO
-
-  };
-}
-
 class MBlazeInstrInfo : public MBlazeGenInstrInfo {
   MBlazeTargetMachine &TM;
   const MBlazeRegisterInfo RI;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
index 950f2d77029bf..1d8c987a0856c 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -442,17 +442,19 @@ let Predicates=[HasMul] in {
 //===----------------------------------------------------------------------===//
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  def LBU  :  LoadM<0x30, 0x000, "lbu    ">;
-  def LBUR :  LoadM<0x30, 0x200, "lbur   ">;
+  let neverHasSideEffects = 1 in {
+    def LBU  :  LoadM<0x30, 0x000, "lbu    ">;
+    def LBUR :  LoadM<0x30, 0x200, "lbur   ">;
 
-  def LHU  :  LoadM<0x31, 0x000, "lhu    ">;
-  def LHUR :  LoadM<0x31, 0x200, "lhur   ">;
+    def LHU  :  LoadM<0x31, 0x000, "lhu    ">;
+    def LHUR :  LoadM<0x31, 0x200, "lhur   ">;
 
-  def LW   :  LoadM<0x32, 0x000, "lw     ">;
-  def LWR  :  LoadM<0x32, 0x200, "lwr    ">;
+    def LW   :  LoadM<0x32, 0x000, "lw     ">;
+    def LWR  :  LoadM<0x32, 0x200, "lwr    ">;
 
-  let Defs = [CARRY] in {
-    def LWX  :  LoadM<0x32, 0x400, "lwx    ">;
+    let Defs = [CARRY] in {
+      def LWX  :  LoadM<0x32, 0x400, "lwx    ">;
+    }
   }
 
   def LBUI : LoadMI<0x38, "lbui   ", zextloadi8>;
@@ -877,6 +879,9 @@ def : Pat<(zextloadi8 xaddr:$addr), (i32 (LBU xaddr:$addr))>;
 // Peepholes
 def : Pat<(store (i32 0), iaddr:$dst), (SWI (i32 R0), iaddr:$dst)>;
 
+// Atomic fence
+def : Pat<(atomic_fence (imm), (imm)), (MEMBARRIER)>;
+
 //===----------------------------------------------------------------------===//
 // Floating Point Support
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
index 32d67b264a206..ea81dd63d195c 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
@@ -37,7 +37,7 @@ namespace mblazeIntrinsic {
 #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
 }
 
-std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, const Type **Tys,
+std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
                                          unsigned numTys) const {
   static const char *const names[] = {
 #define GET_INTRINSIC_NAME_TABLE
@@ -90,8 +90,8 @@ bool MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const {
 #include "MBlazeGenIntrinsics.inc"
 #undef GET_INTRINSIC_ATTRIBUTES
 
-static const FunctionType *getType(LLVMContext &Context, unsigned id) {
-  const Type *ResultTy = NULL;
+static FunctionType *getType(LLVMContext &Context, unsigned id) {
+  Type *ResultTy = NULL;
   std::vector<Type*> ArgTys;
   bool IsVarArg = false;
 
@@ -103,7 +103,7 @@ static const FunctionType *getType(LLVMContext &Context, unsigned id) {
 }
 
 Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
-                                                const Type **Tys,
+                                                Type **Tys,
                                                 unsigned numTy) const {
   assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
   AttrListPtr AList = getAttributes((mblazeIntrinsic::ID) IntrID);
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
index 9804c7723bee2..80760d87e00ab 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
@@ -19,12 +19,12 @@ namespace llvm {
 
   class MBlazeIntrinsicInfo : public TargetIntrinsicInfo {
   public:
-    std::string getName(unsigned IntrID, const Type **Tys = 0,
+    std::string getName(unsigned IntrID, Type **Tys = 0,
                         unsigned numTys = 0) const;
     unsigned lookupName(const char *Name, unsigned Len) const;
     unsigned lookupGCCName(const char *Name) const;
     bool isOverloaded(unsigned IID) const;
-    Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0,
+    Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
                              unsigned numTys = 0) const;
   };
 
diff --git a/lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp
deleted file mode 100644
index ddc636d0ce646..0000000000000
--- a/lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//===-- MBlazeMCCodeEmitter.cpp - Convert MBlaze code to machine code -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the MBlazeMCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "MBlaze.h"
-#include "MBlazeInstrInfo.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
-
-namespace {
-class MBlazeMCCodeEmitter : public MCCodeEmitter {
-  MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
-  const MCInstrInfo &MCII;
-
-public:
-  MBlazeMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                      MCContext &ctx)
-    : MCII(mcii) {
-  }
-
-  ~MBlazeMCCodeEmitter() {}
-
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  unsigned getBinaryCodeForInstr(const MCInst &MI) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO) const;
-  unsigned getMachineOpValue(const MCInst &MI, unsigned OpIdx) const {
-    return getMachineOpValue(MI, MI.getOperand(OpIdx));
-  }
-
-  static unsigned GetMBlazeRegNum(const MCOperand &MO) {
-    // FIXME: getMBlazeRegisterNumbering() is sufficient?
-    assert(0 && "MBlazeMCCodeEmitter::GetMBlazeRegNum() not yet implemented.");
-    return 0;
-  }
-
-  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
-    // The MicroBlaze uses a bit reversed format so we need to reverse the
-    // order of the bits. Taken from:
-    // http://graphics.stanford.edu/~seander/bithacks.html
-    C = ((C * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-
-    OS << (char)C;
-    ++CurByte;
-  }
-
-  void EmitRawByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
-    OS << (char)C;
-    ++CurByte;
-  }
-
-  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-                    raw_ostream &OS) const {
-    assert(Size <= 8 && "size too big in emit constant");
-
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, CurByte, OS);
-      Val >>= 8;
-    }
-  }
-
-  void EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const;
-  void EmitIMM(const MCInst &MI, unsigned &CurByte, raw_ostream &OS) const;
-
-  void EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel,
-                     unsigned &CurByte, raw_ostream &OS,
-                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups) const;
-};
-
-} // end anonymous namespace
-
-
-MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
-                                               const MCSubtargetInfo &STI,
-                                               MCContext &Ctx) {
-  return new MBlazeMCCodeEmitter(MCII, STI, Ctx);
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned MBlazeMCCodeEmitter::getMachineOpValue(const MCInst &MI,
-                                             const MCOperand &MO) const {
-  if (MO.isReg())
-    return MBlazeRegisterInfo::getRegisterNumbering(MO.getReg());
-  else if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  else if (MO.isExpr())
-      return 0; // The relocation has already been recorded at this point.
-  else {
-#ifndef NDEBUG
-    errs() << MO;
-#endif
-    llvm_unreachable(0);
-  }
-  return 0;
-}
-
-void MBlazeMCCodeEmitter::
-EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const {
-  int32_t val = (int32_t)imm.getImm();
-  if (val > 32767 || val < -32768) {
-    EmitByte(0x0D, CurByte, OS);
-    EmitByte(0x00, CurByte, OS);
-    EmitRawByte((val >> 24) & 0xFF, CurByte, OS);
-    EmitRawByte((val >> 16) & 0xFF, CurByte, OS);
-  }
-}
-
-void MBlazeMCCodeEmitter::
-EmitIMM(const MCInst &MI, unsigned &CurByte,raw_ostream &OS) const {
-  switch (MI.getOpcode()) {
-  default: break;
-
-  case MBlaze::ADDIK32:
-  case MBlaze::ORI32:
-  case MBlaze::BRLID32:
-    EmitByte(0x0D, CurByte, OS);
-    EmitByte(0x00, CurByte, OS);
-    EmitRawByte(0, CurByte, OS);
-    EmitRawByte(0, CurByte, OS);
-  }
-}
-
-void MBlazeMCCodeEmitter::
-EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel, unsigned &CurByte,
-              raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const {
-  assert(MI.getNumOperands()>opNo && "Not enought operands for instruction");
-
-  MCOperand oper = MI.getOperand(opNo);
-
-  if (oper.isImm()) {
-    EmitIMM(oper, CurByte, OS);
-  } else if (oper.isExpr()) {
-    MCFixupKind FixupKind;
-    switch (MI.getOpcode()) {
-    default:
-      FixupKind = pcrel ? FK_PCRel_2 : FK_Data_2;
-      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
-      break;
-    case MBlaze::ORI32:
-    case MBlaze::ADDIK32:
-    case MBlaze::BRLID32:
-      FixupKind = pcrel ? FK_PCRel_4 : FK_Data_4;
-      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
-      break;
-    }
-  }
-}
-
-
-
-void MBlazeMCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Opcode = MI.getOpcode();
-  const MCInstrDesc &Desc = MCII.get(Opcode);
-  uint64_t TSFlags = Desc.TSFlags;
-  // Keep track of the current byte being emitted.
-  unsigned CurByte = 0;
-
-  // Emit an IMM instruction if the instruction we are encoding requires it
-  EmitIMM(MI,CurByte,OS);
-
-  switch ((TSFlags & MBlazeII::FormMask)) {
-  default: break;
-  case MBlazeII::FPseudo:
-    // Pseudo instructions don't get encoded.
-    return;
-  case MBlazeII::FRRI:
-    EmitImmediate(MI, 2, false, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FRIR:
-    EmitImmediate(MI, 1, false, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FCRI:
-    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FRCI:
-    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
-  case MBlazeII::FCCI:
-    EmitImmediate(MI, 0, true, CurByte, OS, Fixups);
-    break;
-  }
-
-  ++MCNumEmitted;  // Keep track of the # of mi's emitted
-  unsigned Value = getBinaryCodeForInstr(MI);
-  EmitConstant(Value, 4, CurByte, OS);
-}
-
-// FIXME: These #defines shouldn't be necessary. Instead, tblgen should
-// be able to generate code emitter helpers for either variant, like it
-// does for the AsmWriter.
-#define MBlazeCodeEmitter MBlazeMCCodeEmitter
-#define MachineInstr MCInst
-#include "MBlazeGenCodeEmitter.inc"
-#undef MBlazeCodeEmitter
-#undef MachineInstr
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index f0b201a661709..9788ba9e6021e 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -25,7 +25,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -44,164 +43,7 @@ using namespace llvm;
 
 MBlazeRegisterInfo::
 MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii)
-  : MBlazeGenRegisterInfo(), Subtarget(ST), TII(tii) {}
-
-/// getRegisterNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-unsigned MBlazeRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
-  switch (RegEnum) {
-    case MBlaze::R0     : return 0;
-    case MBlaze::R1     : return 1;
-    case MBlaze::R2     : return 2;
-    case MBlaze::R3     : return 3;
-    case MBlaze::R4     : return 4;
-    case MBlaze::R5     : return 5;
-    case MBlaze::R6     : return 6;
-    case MBlaze::R7     : return 7;
-    case MBlaze::R8     : return 8;
-    case MBlaze::R9     : return 9;
-    case MBlaze::R10    : return 10;
-    case MBlaze::R11    : return 11;
-    case MBlaze::R12    : return 12;
-    case MBlaze::R13    : return 13;
-    case MBlaze::R14    : return 14;
-    case MBlaze::R15    : return 15;
-    case MBlaze::R16    : return 16;
-    case MBlaze::R17    : return 17;
-    case MBlaze::R18    : return 18;
-    case MBlaze::R19    : return 19;
-    case MBlaze::R20    : return 20;
-    case MBlaze::R21    : return 21;
-    case MBlaze::R22    : return 22;
-    case MBlaze::R23    : return 23;
-    case MBlaze::R24    : return 24;
-    case MBlaze::R25    : return 25;
-    case MBlaze::R26    : return 26;
-    case MBlaze::R27    : return 27;
-    case MBlaze::R28    : return 28;
-    case MBlaze::R29    : return 29;
-    case MBlaze::R30    : return 30;
-    case MBlaze::R31    : return 31;
-    case MBlaze::RPC    : return 0x0000;
-    case MBlaze::RMSR   : return 0x0001;
-    case MBlaze::REAR   : return 0x0003;
-    case MBlaze::RESR   : return 0x0005;
-    case MBlaze::RFSR   : return 0x0007;
-    case MBlaze::RBTR   : return 0x000B;
-    case MBlaze::REDR   : return 0x000D;
-    case MBlaze::RPID   : return 0x1000;
-    case MBlaze::RZPR   : return 0x1001;
-    case MBlaze::RTLBX  : return 0x1002;
-    case MBlaze::RTLBLO : return 0x1003;
-    case MBlaze::RTLBHI : return 0x1004;
-    case MBlaze::RPVR0  : return 0x2000;
-    case MBlaze::RPVR1  : return 0x2001;
-    case MBlaze::RPVR2  : return 0x2002;
-    case MBlaze::RPVR3  : return 0x2003;
-    case MBlaze::RPVR4  : return 0x2004;
-    case MBlaze::RPVR5  : return 0x2005;
-    case MBlaze::RPVR6  : return 0x2006;
-    case MBlaze::RPVR7  : return 0x2007;
-    case MBlaze::RPVR8  : return 0x2008;
-    case MBlaze::RPVR9  : return 0x2009;
-    case MBlaze::RPVR10 : return 0x200A;
-    case MBlaze::RPVR11 : return 0x200B;
-    default: llvm_unreachable("Unknown register number!");
-  }
-  return 0; // Not reached
-}
-
-/// getRegisterFromNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-unsigned MBlazeRegisterInfo::getRegisterFromNumbering(unsigned Reg) {
-  switch (Reg) {
-    case 0  : return MBlaze::R0;
-    case 1  : return MBlaze::R1;
-    case 2  : return MBlaze::R2;
-    case 3  : return MBlaze::R3;
-    case 4  : return MBlaze::R4;
-    case 5  : return MBlaze::R5;
-    case 6  : return MBlaze::R6;
-    case 7  : return MBlaze::R7;
-    case 8  : return MBlaze::R8;
-    case 9  : return MBlaze::R9;
-    case 10 : return MBlaze::R10;
-    case 11 : return MBlaze::R11;
-    case 12 : return MBlaze::R12;
-    case 13 : return MBlaze::R13;
-    case 14 : return MBlaze::R14;
-    case 15 : return MBlaze::R15;
-    case 16 : return MBlaze::R16;
-    case 17 : return MBlaze::R17;
-    case 18 : return MBlaze::R18;
-    case 19 : return MBlaze::R19;
-    case 20 : return MBlaze::R20;
-    case 21 : return MBlaze::R21;
-    case 22 : return MBlaze::R22;
-    case 23 : return MBlaze::R23;
-    case 24 : return MBlaze::R24;
-    case 25 : return MBlaze::R25;
-    case 26 : return MBlaze::R26;
-    case 27 : return MBlaze::R27;
-    case 28 : return MBlaze::R28;
-    case 29 : return MBlaze::R29;
-    case 30 : return MBlaze::R30;
-    case 31 : return MBlaze::R31;
-    default: llvm_unreachable("Unknown register number!");
-  }
-  return 0; // Not reached
-}
-
-unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) {
-  switch (Reg) {
-    case 0x0000 : return MBlaze::RPC;
-    case 0x0001 : return MBlaze::RMSR;
-    case 0x0003 : return MBlaze::REAR;
-    case 0x0005 : return MBlaze::RESR;
-    case 0x0007 : return MBlaze::RFSR;
-    case 0x000B : return MBlaze::RBTR;
-    case 0x000D : return MBlaze::REDR;
-    case 0x1000 : return MBlaze::RPID;
-    case 0x1001 : return MBlaze::RZPR;
-    case 0x1002 : return MBlaze::RTLBX;
-    case 0x1003 : return MBlaze::RTLBLO;
-    case 0x1004 : return MBlaze::RTLBHI;
-    case 0x2000 : return MBlaze::RPVR0;
-    case 0x2001 : return MBlaze::RPVR1;
-    case 0x2002 : return MBlaze::RPVR2;
-    case 0x2003 : return MBlaze::RPVR3;
-    case 0x2004 : return MBlaze::RPVR4;
-    case 0x2005 : return MBlaze::RPVR5;
-    case 0x2006 : return MBlaze::RPVR6;
-    case 0x2007 : return MBlaze::RPVR7;
-    case 0x2008 : return MBlaze::RPVR8;
-    case 0x2009 : return MBlaze::RPVR9;
-    case 0x200A : return MBlaze::RPVR10;
-    case 0x200B : return MBlaze::RPVR11;
-    default: llvm_unreachable("Unknown register number!");
-  }
-  return 0; // Not reached
-}
-
-bool MBlazeRegisterInfo::isRegister(unsigned Reg) {
-  return Reg <= 31;
-}
-
-bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) {
-  switch (Reg) {
-    case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : 
-    case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : 
-    case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : 
-    case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : 
-    case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : 
-    case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : 
-      return true;
-
-    default:
-      return false;
-  }
-  return false; // Not reached
-}
+  : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST), TII(tii) {}
 
 unsigned MBlazeRegisterInfo::getPICCallReg() {
   return MBlaze::R20;
@@ -334,10 +176,6 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
     MFI->setObjectOffset(MBlazeFI->getGPFI(), MBlazeFI->getGPStackOffset());
 }
 
-unsigned MBlazeRegisterInfo::getRARegister() const {
-  return MBlaze::R15;
-}
-
 unsigned MBlazeRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
@@ -353,11 +191,3 @@ unsigned MBlazeRegisterInfo::getEHHandlerRegister() const {
   llvm_unreachable("What is the exception handler register");
   return 0;
 }
-
-int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
-  return MBlazeGenRegisterInfo::getDwarfRegNumFull(RegNo,0);
-}
-
-int MBlazeRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  return MBlazeGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 7ebce21d3a801..7e4b269cb8870 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -42,14 +42,6 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
   MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget,
                      const TargetInstrInfo &tii);
 
-  /// getRegisterNumbering - Given the enum value for some register, e.g.
-  /// MBlaze::RA, return the number that it corresponds to (e.g. 31).
-  static unsigned getRegisterNumbering(unsigned RegEnum);
-  static unsigned getRegisterFromNumbering(unsigned RegEnum);
-  static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum);
-  static bool isRegister(unsigned RegEnum);
-  static bool isSpecialRegister(unsigned RegEnum);
-
   /// Get PIC indirect call register
   static unsigned getPICCallReg();
 
@@ -69,15 +61,11 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
   void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
 
   /// Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   /// Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
-
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp
index eda141daf2b3e..7e5667f55c15d 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.cpp
+++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp
@@ -15,7 +15,7 @@
 #include "MBlaze.h"
 #include "MBlazeRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index 7208874aef1da..7bff53ef8717f 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -16,48 +16,13 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 using namespace llvm;
 
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
-                                    MCContext &Ctx, TargetAsmBackend &TAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin()) {
-    llvm_unreachable("MBlaze does not support Darwin MACH-O format");
-    return NULL;
-  }
-
-  if (TheTriple.isOSWindows()) {
-    llvm_unreachable("MBlaze does not support Windows COFF format");
-    return NULL;
-  }
-
-  return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
-}
-
-
 extern "C" void LLVMInitializeMBlazeTarget() {
   // Register the target.
   RegisterTargetMachine<MBlazeTargetMachine> X(TheMBlazeTarget);
-
-  // Register the MC code emitter
-  TargetRegistry::RegisterCodeEmitter(TheMBlazeTarget,
-                                      llvm::createMBlazeMCCodeEmitter);
-
-  // Register the asm backend
-  TargetRegistry::RegisterAsmBackend(TheMBlazeTarget,
-                                     createMBlazeAsmBackend);
-
-  // Register the object streamer
-  TargetRegistry::RegisterObjectStreamer(TheMBlazeTarget,
-                                         createMCStreamer);
-
 }
 
 // DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -67,21 +32,16 @@ extern "C" void LLVMInitializeMBlazeTarget() {
 // offset from the stack/frame pointer, using StackGrowsUp enables
 // an easier handling.
 MBlazeTargetMachine::
-MBlazeTargetMachine(const Target &T, const std::string &TT,
-                    const std::string &CPU, const std::string &FS):
-  LLVMTargetMachine(T, TT, CPU, FS),
+MBlazeTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS,
+                    Reloc::Model RM, CodeModel::Model CM):
+  LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
   Subtarget(TT, CPU, FS),
   DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"),
   InstrInfo(*this),
   FrameLowering(Subtarget),
   TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this),
   InstrItins(Subtarget.getInstrItineraryData()) {
-  if (getRelocationModel() == Reloc::Default) {
-      setRelocationModel(Reloc::Static);
-  }
-
-  if (getCodeModel() == CodeModel::Default)
-    setCodeModel(CodeModel::Small);
 }
 
 // Install an instruction selector pass using
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h
index cd6caafbf3096..c1bc08aeb5050 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.h
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.h
@@ -41,8 +41,9 @@ namespace llvm {
     InstrItineraryData     InstrItins;
 
   public:
-    MBlazeTargetMachine(const Target &T, const std::string &TT,
-                        const std::string &CPU, const std::string &FS);
+    MBlazeTargetMachine(const Target &T, StringRef TT,
+                        StringRef CPU, StringRef FS,
+                        Reloc::Model RM, CodeModel::Model CM);
 
     virtual const MBlazeInstrInfo *getInstrInfo() const
     { return &InstrInfo; }
diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
index abd1b0b62c7de..f66ea302d9fed 100644
--- a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
@@ -69,7 +69,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
   if (Kind.isMergeable1ByteCString())
     return false;
 
-  const Type *Ty = GV->getType()->getElementType();
+  Type *Ty = GV->getType()->getElementType();
   return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
 }
 
diff --git a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
index 3d15708c35b8e..37871b6916c9f 100644
--- a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,15 @@
 add_llvm_library(LLVMMBlazeDesc
-  MBlazeMCTargetDesc.cpp
+  MBlazeAsmBackend.cpp
   MBlazeMCAsmInfo.cpp
+  MBlazeMCCodeEmitter.cpp
+  MBlazeMCTargetDesc.cpp
+  )
+
+add_llvm_library_dependencies(LLVMMBlazeDesc
+  LLVMMBlazeAsmPrinter
+  LLVMMBlazeInfo
+  LLVMMC
+  LLVMSupport
   )
+
+add_dependencies(LLVMMBlazeDesc MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
new file mode 100644
index 0000000000000..08f7d46a58f93
--- /dev/null
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
@@ -0,0 +1,159 @@
+//===-- MBlazeAsmBackend.cpp - MBlaze Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static unsigned getFixupKindSize(unsigned Kind) {
+  switch (Kind) {
+  default: assert(0 && "invalid fixup kind!");
+  case FK_Data_1: return 1;
+  case FK_PCRel_2:
+  case FK_Data_2: return 2;
+  case FK_PCRel_4:
+  case FK_Data_4: return 4;
+  case FK_Data_8: return 8;
+  }
+}
+
+
+namespace {
+class MBlazeELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  MBlazeELFObjectWriter(Triple::OSType OSType)
+    : MCELFObjectTargetWriter(/*is64Bit*/ false, OSType, ELF::EM_MBLAZE,
+                              /*HasRelocationAddend*/ true) {}
+};
+
+class MBlazeAsmBackend : public MCAsmBackend {
+public:
+  MBlazeAsmBackend(const Target &T)
+    : MCAsmBackend() {
+  }
+
+  unsigned getNumFixupKinds() const {
+    return 2;
+  }
+
+  bool MayNeedRelaxation(const MCInst &Inst) const;
+
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  unsigned getPointerSize() const {
+    return 4;
+  }
+};
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+    switch (Op) {
+    default:            return Op;
+    case MBlaze::ADDIK: return MBlaze::ADDIK32;
+    case MBlaze::ORI:   return MBlaze::ORI32;
+    case MBlaze::BRLID: return MBlaze::BRLID32;
+    }
+}
+
+bool MBlazeAsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+  if (getRelaxedOpcode(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+  bool hasExprOrImm = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+    hasExprOrImm |= Inst.getOperand(i).isExpr();
+
+  return hasExprOrImm;
+}
+
+void MBlazeAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  Res = Inst;
+  Res.setOpcode(getRelaxedOpcode(Inst.getOpcode()));
+}
+
+bool MBlazeAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if ((Count % 4) != 0)
+    return false;
+
+  for (uint64_t i = 0; i < Count; i += 4)
+      OW->Write32(0x00000000);
+
+  return true;
+}
+} // end anonymous namespace
+
+namespace {
+class ELFMBlazeAsmBackend : public MBlazeAsmBackend {
+public:
+  Triple::OSType OSType;
+  ELFMBlazeAsmBackend(const Target &T, Triple::OSType _OSType)
+    : MBlazeAsmBackend(T), OSType(_OSType) { }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(new MBlazeELFObjectWriter(OSType), OS,
+                                 /*IsLittleEndian*/ false);
+  }
+};
+
+void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value) const {
+  unsigned Size = getFixupKindSize(Fixup.getKind());
+
+  assert(Fixup.getOffset() + Size <= DataSize &&
+         "Invalid fixup offset!");
+
+  char *data = Data + Fixup.getOffset();
+  switch (Size) {
+  default: llvm_unreachable("Cannot fixup unknown value.");
+  case 1:  llvm_unreachable("Cannot fixup 1 byte value.");
+  case 8:  llvm_unreachable("Cannot fixup 8 byte value.");
+
+  case 4:
+    *(data+7) = uint8_t(Value);
+    *(data+6) = uint8_t(Value >> 8);
+    *(data+3) = uint8_t(Value >> 16);
+    *(data+2) = uint8_t(Value >> 24);
+    break;
+
+  case 2:
+    *(data+3) = uint8_t(Value >> 0);
+    *(data+2) = uint8_t(Value >> 8);
+  }
+}
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    assert(0 && "Mac not supported on MBlaze");
+
+  if (TheTriple.isOSWindows())
+    assert(0 && "Windows not supported on MBlaze");
+
+  return new ELFMBlazeAsmBackend(T, TheTriple.getOS());
+}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
new file mode 100644
index 0000000000000..776dbc4d86780
--- /dev/null
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
@@ -0,0 +1,240 @@
+//===-- MBlazeBaseInfo.h - Top level definitions for MBlaze -- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the MBlaze target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBlazeBASEINFO_H
+#define MBlazeBASEINFO_H
+
+#include "MBlazeMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MBlazeII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MBlazeII {
+  enum {
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    FPseudo = 0,
+    FRRR,
+    FRRI,
+    FCRR,
+    FCRI,
+    FRCR,
+    FRCI,
+    FCCR,
+    FCCI,
+    FRRCI,
+    FRRC,
+    FRCX,
+    FRCS,
+    FCRCS,
+    FCRCX,
+    FCX,
+    FCR,
+    FRIR,
+    FRRRR,
+    FRI,
+    FC,
+    FormMask = 63
+
+    //===------------------------------------------------------------------===//
+    // MBlaze Specific MachineOperand flags.
+    // MO_NO_FLAG,
+
+    /// MO_GOT - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    // MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at
+    /// which the address of a call site relocation entry symbol resides
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    // MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used
+    /// for the relocatable object file being produced.
+    // MO_GPREL,
+
+    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
+    /// address.
+    // MO_ABS_HILO
+
+  };
+}
+
+static inline bool isMBlazeRegister(unsigned Reg) {
+  return Reg <= 31;
+}
+
+static inline bool isSpecialMBlazeRegister(unsigned Reg) {
+  switch (Reg) {
+    case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : 
+    case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : 
+    case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : 
+    case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : 
+    case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : 
+    case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : 
+      return true;
+
+    default:
+      return false;
+  }
+  return false; // Not reached
+}
+
+/// getMBlazeRegisterNumbering - Given the enum value for some register, e.g.
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
+static inline unsigned getMBlazeRegisterNumbering(unsigned RegEnum) {
+  switch (RegEnum) {
+    case MBlaze::R0     : return 0;
+    case MBlaze::R1     : return 1;
+    case MBlaze::R2     : return 2;
+    case MBlaze::R3     : return 3;
+    case MBlaze::R4     : return 4;
+    case MBlaze::R5     : return 5;
+    case MBlaze::R6     : return 6;
+    case MBlaze::R7     : return 7;
+    case MBlaze::R8     : return 8;
+    case MBlaze::R9     : return 9;
+    case MBlaze::R10    : return 10;
+    case MBlaze::R11    : return 11;
+    case MBlaze::R12    : return 12;
+    case MBlaze::R13    : return 13;
+    case MBlaze::R14    : return 14;
+    case MBlaze::R15    : return 15;
+    case MBlaze::R16    : return 16;
+    case MBlaze::R17    : return 17;
+    case MBlaze::R18    : return 18;
+    case MBlaze::R19    : return 19;
+    case MBlaze::R20    : return 20;
+    case MBlaze::R21    : return 21;
+    case MBlaze::R22    : return 22;
+    case MBlaze::R23    : return 23;
+    case MBlaze::R24    : return 24;
+    case MBlaze::R25    : return 25;
+    case MBlaze::R26    : return 26;
+    case MBlaze::R27    : return 27;
+    case MBlaze::R28    : return 28;
+    case MBlaze::R29    : return 29;
+    case MBlaze::R30    : return 30;
+    case MBlaze::R31    : return 31;
+    case MBlaze::RPC    : return 0x0000;
+    case MBlaze::RMSR   : return 0x0001;
+    case MBlaze::REAR   : return 0x0003;
+    case MBlaze::RESR   : return 0x0005;
+    case MBlaze::RFSR   : return 0x0007;
+    case MBlaze::RBTR   : return 0x000B;
+    case MBlaze::REDR   : return 0x000D;
+    case MBlaze::RPID   : return 0x1000;
+    case MBlaze::RZPR   : return 0x1001;
+    case MBlaze::RTLBX  : return 0x1002;
+    case MBlaze::RTLBLO : return 0x1003;
+    case MBlaze::RTLBHI : return 0x1004;
+    case MBlaze::RPVR0  : return 0x2000;
+    case MBlaze::RPVR1  : return 0x2001;
+    case MBlaze::RPVR2  : return 0x2002;
+    case MBlaze::RPVR3  : return 0x2003;
+    case MBlaze::RPVR4  : return 0x2004;
+    case MBlaze::RPVR5  : return 0x2005;
+    case MBlaze::RPVR6  : return 0x2006;
+    case MBlaze::RPVR7  : return 0x2007;
+    case MBlaze::RPVR8  : return 0x2008;
+    case MBlaze::RPVR9  : return 0x2009;
+    case MBlaze::RPVR10 : return 0x200A;
+    case MBlaze::RPVR11 : return 0x200B;
+    default: llvm_unreachable("Unknown register number!");
+  }
+  return 0; // Not reached
+}
+
+/// getRegisterFromNumbering - Given the enum value for some register, e.g.
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
+static inline unsigned getMBlazeRegisterFromNumbering(unsigned Reg) {
+  switch (Reg) {
+    case 0  : return MBlaze::R0;
+    case 1  : return MBlaze::R1;
+    case 2  : return MBlaze::R2;
+    case 3  : return MBlaze::R3;
+    case 4  : return MBlaze::R4;
+    case 5  : return MBlaze::R5;
+    case 6  : return MBlaze::R6;
+    case 7  : return MBlaze::R7;
+    case 8  : return MBlaze::R8;
+    case 9  : return MBlaze::R9;
+    case 10 : return MBlaze::R10;
+    case 11 : return MBlaze::R11;
+    case 12 : return MBlaze::R12;
+    case 13 : return MBlaze::R13;
+    case 14 : return MBlaze::R14;
+    case 15 : return MBlaze::R15;
+    case 16 : return MBlaze::R16;
+    case 17 : return MBlaze::R17;
+    case 18 : return MBlaze::R18;
+    case 19 : return MBlaze::R19;
+    case 20 : return MBlaze::R20;
+    case 21 : return MBlaze::R21;
+    case 22 : return MBlaze::R22;
+    case 23 : return MBlaze::R23;
+    case 24 : return MBlaze::R24;
+    case 25 : return MBlaze::R25;
+    case 26 : return MBlaze::R26;
+    case 27 : return MBlaze::R27;
+    case 28 : return MBlaze::R28;
+    case 29 : return MBlaze::R29;
+    case 30 : return MBlaze::R30;
+    case 31 : return MBlaze::R31;
+    default: llvm_unreachable("Unknown register number!");
+  }
+  return 0; // Not reached
+}
+
+static inline unsigned getSpecialMBlazeRegisterFromNumbering(unsigned Reg) {
+  switch (Reg) {
+    case 0x0000 : return MBlaze::RPC;
+    case 0x0001 : return MBlaze::RMSR;
+    case 0x0003 : return MBlaze::REAR;
+    case 0x0005 : return MBlaze::RESR;
+    case 0x0007 : return MBlaze::RFSR;
+    case 0x000B : return MBlaze::RBTR;
+    case 0x000D : return MBlaze::REDR;
+    case 0x1000 : return MBlaze::RPID;
+    case 0x1001 : return MBlaze::RZPR;
+    case 0x1002 : return MBlaze::RTLBX;
+    case 0x1003 : return MBlaze::RTLBLO;
+    case 0x1004 : return MBlaze::RTLBHI;
+    case 0x2000 : return MBlaze::RPVR0;
+    case 0x2001 : return MBlaze::RPVR1;
+    case 0x2002 : return MBlaze::RPVR2;
+    case 0x2003 : return MBlaze::RPVR3;
+    case 0x2004 : return MBlaze::RPVR4;
+    case 0x2005 : return MBlaze::RPVR5;
+    case 0x2006 : return MBlaze::RPVR6;
+    case 0x2007 : return MBlaze::RPVR7;
+    case 0x2008 : return MBlaze::RPVR8;
+    case 0x2009 : return MBlaze::RPVR9;
+    case 0x200A : return MBlaze::RPVR10;
+    case 0x200B : return MBlaze::RPVR11;
+    default: llvm_unreachable("Unknown register number!");
+  }
+  return 0; // Not reached
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
new file mode 100644
index 0000000000000..1514557bf00b0
--- /dev/null
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -0,0 +1,224 @@
+//===-- MBlazeMCCodeEmitter.cpp - Convert MBlaze code to machine code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MBlazeMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class MBlazeMCCodeEmitter : public MCCodeEmitter {
+  MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
+  const MCInstrInfo &MCII;
+
+public:
+  MBlazeMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                      MCContext &ctx)
+    : MCII(mcii) {
+  }
+
+  ~MBlazeMCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  unsigned getBinaryCodeForInstr(const MCInst &MI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO) const;
+  unsigned getMachineOpValue(const MCInst &MI, unsigned OpIdx) const {
+    return getMachineOpValue(MI, MI.getOperand(OpIdx));
+  }
+
+  static unsigned GetMBlazeRegNum(const MCOperand &MO) {
+    // FIXME: getMBlazeRegisterNumbering() is sufficient?
+    assert(0 && "MBlazeMCCodeEmitter::GetMBlazeRegNum() not yet implemented.");
+    return 0;
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    // The MicroBlaze uses a bit reversed format so we need to reverse the
+    // order of the bits. Taken from:
+    // http://graphics.stanford.edu/~seander/bithacks.html
+    C = ((C * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitRawByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    assert(Size <= 8 && "size too big in emit constant");
+
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const;
+  void EmitIMM(const MCInst &MI, unsigned &CurByte, raw_ostream &OS) const;
+
+  void EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new MBlazeMCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MBlazeMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO) const {
+  if (MO.isReg())
+    return getMBlazeRegisterNumbering(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isExpr())
+      return 0; // The relocation has already been recorded at this point.
+  else {
+#ifndef NDEBUG
+    errs() << MO;
+#endif
+    llvm_unreachable(0);
+  }
+  return 0;
+}
+
+void MBlazeMCCodeEmitter::
+EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const {
+  int32_t val = (int32_t)imm.getImm();
+  if (val > 32767 || val < -32768) {
+    EmitByte(0x0D, CurByte, OS);
+    EmitByte(0x00, CurByte, OS);
+    EmitRawByte((val >> 24) & 0xFF, CurByte, OS);
+    EmitRawByte((val >> 16) & 0xFF, CurByte, OS);
+  }
+}
+
+void MBlazeMCCodeEmitter::
+EmitIMM(const MCInst &MI, unsigned &CurByte,raw_ostream &OS) const {
+  switch (MI.getOpcode()) {
+  default: break;
+
+  case MBlaze::ADDIK32:
+  case MBlaze::ORI32:
+  case MBlaze::BRLID32:
+    EmitByte(0x0D, CurByte, OS);
+    EmitByte(0x00, CurByte, OS);
+    EmitRawByte(0, CurByte, OS);
+    EmitRawByte(0, CurByte, OS);
+  }
+}
+
+void MBlazeMCCodeEmitter::
+EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel, unsigned &CurByte,
+              raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getNumOperands()>opNo && "Not enought operands for instruction");
+
+  MCOperand oper = MI.getOperand(opNo);
+
+  if (oper.isImm()) {
+    EmitIMM(oper, CurByte, OS);
+  } else if (oper.isExpr()) {
+    MCFixupKind FixupKind;
+    switch (MI.getOpcode()) {
+    default:
+      FixupKind = pcrel ? FK_PCRel_2 : FK_Data_2;
+      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
+      break;
+    case MBlaze::ORI32:
+    case MBlaze::ADDIK32:
+    case MBlaze::BRLID32:
+      FixupKind = pcrel ? FK_PCRel_4 : FK_Data_4;
+      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
+      break;
+    }
+  }
+}
+
+
+
+void MBlazeMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Emit an IMM instruction if the instruction we are encoding requires it
+  EmitIMM(MI,CurByte,OS);
+
+  switch ((TSFlags & MBlazeII::FormMask)) {
+  default: break;
+  case MBlazeII::FPseudo:
+    // Pseudo instructions don't get encoded.
+    return;
+  case MBlazeII::FRRI:
+    EmitImmediate(MI, 2, false, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FRIR:
+    EmitImmediate(MI, 1, false, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FCRI:
+    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FRCI:
+    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
+  case MBlazeII::FCCI:
+    EmitImmediate(MI, 0, true, CurByte, OS, Fixups);
+    break;
+  }
+
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted
+  unsigned Value = getBinaryCodeForInstr(MI);
+  EmitConstant(Value, 4, CurByte, OS);
+}
+
+// FIXME: These #defines shouldn't be necessary. Instead, tblgen should
+// be able to generate code emitter helpers for either variant, like it
+// does for the AsmWriter.
+#define MBlazeCodeEmitter MBlazeMCCodeEmitter
+#define MachineInstr MCInst
+#include "MBlazeGenCodeEmitter.inc"
+#undef MBlazeCodeEmitter
+#undef MachineInstr
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
index 20d6c0bd2156f..43ae281519c25 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
@@ -13,10 +13,14 @@
 
 #include "MBlazeMCTargetDesc.h"
 #include "MBlazeMCAsmInfo.h"
+#include "InstPrinter/MBlazeInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "MBlazeGenInstrInfo.inc"
@@ -36,8 +40,10 @@ static MCInstrInfo *createMBlazeMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeMBlazeMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
+static MCRegisterInfo *createMBlazeMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMBlazeMCRegisterInfo(X, MBlaze::R15);
+  return X;
 }
 
 static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,11 +53,6 @@ static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeMBlazeMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
-                                          createMBlazeMCSubtargetInfo);
-}
-
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
   switch (TheTriple.getOS()) {
@@ -60,6 +61,80 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   }
 }
 
-extern "C" void LLVMInitializeMBlazeMCAsmInfo() {
+static MCCodeGenInfo *createMBlazeMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default)
+    RM = Reloc::Static;
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin()) {
+    llvm_unreachable("MBlaze does not support Darwin MACH-O format");
+    return NULL;
+  }
+
+  if (TheTriple.isOSWindows()) {
+    llvm_unreachable("MBlaze does not support Windows COFF format");
+    return NULL;
+  }
+
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new MBlazeInstPrinter(MAI);
+  return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMBlazeTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfoFn X(TheMBlazeTarget, createMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMBlazeTarget,
+                                        createMBlazeMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMBlazeTarget,
+                                    createMBlazeMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
+                                          createMBlazeMCSubtargetInfo);
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheMBlazeTarget,
+                                        llvm::createMBlazeMCCodeEmitter);
+
+  // Register the asm backend
+  TargetRegistry::RegisterMCAsmBackend(TheMBlazeTarget,
+                                       createMBlazeAsmBackend);
+
+  // Register the object streamer
+  TargetRegistry::RegisterMCObjectStreamer(TheMBlazeTarget,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
+                                        createMBlazeMCInstPrinter);
 }
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
index b14772ef060b3..deff5cb078f95 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -15,12 +15,23 @@
 #define MBLAZEMCTARGETDESC_H
 
 namespace llvm {
+class MCAsmBackend;
+class MCContext;
+class MCCodeEmitter;
+class MCInstrInfo;
 class MCSubtargetInfo;
 class Target;
 class StringRef;
+class formatted_raw_ostream;
 
 extern Target TheMBlazeTarget;
 
+MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+  
+MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT);
+
 } // End llvm namespace
 
 // Defines symbolic names for MBlaze registers.  This defines a mapping from
diff --git a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
index 40696f63c462a..93fce58883edc 100644
--- a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
+++ b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
@@ -5,4 +5,10 @@ add_llvm_library(LLVMMBlazeInfo
   MBlazeTargetInfo.cpp
   )
 
-add_dependencies(LLVMMBlazeInfo MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMMBlazeInfo MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
index 16e01dbfdeed1..71210d8db4666 100644
--- a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
+++ b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "MBlaze.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheMBlazeTarget;
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 33f3d449ed99b..0952b76aefab0 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -1,11 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS MSP430.td)
 
-tablegen(MSP430GenRegisterInfo.inc -gen-register-info)
-tablegen(MSP430GenInstrInfo.inc -gen-instr-info)
-tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
-tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
-tablegen(MSP430GenCallingConv.inc -gen-callingconv)
-tablegen(MSP430GenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MSP430GenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MSP430GenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MSP430GenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MSP430GenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(MSP430CommonTableGen)
 
 add_llvm_target(MSP430CodeGen
   MSP430BranchSelector.cpp
@@ -21,6 +22,19 @@ add_llvm_target(MSP430CodeGen
   MSP430MCInstLower.cpp
   )
 
+add_llvm_library_dependencies(LLVMMSP430CodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMMSP430AsmPrinter
+  LLVMMSP430Desc
+  LLVMMSP430Info
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/MSP430/InstPrinter/CMakeLists.txt b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
index f5458d59a8217..ce39d9517efe1 100644
--- a/lib/Target/MSP430/InstPrinter/CMakeLists.txt
+++ b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMMSP430AsmPrinter
   MSP430InstPrinter.cpp
   )
-add_dependencies(LLVMMSP430AsmPrinter MSP430CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMMSP430AsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
+
+add_dependencies(LLVMMSP430AsmPrinter MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index e10d4fe7ca165..5d6c6ad93dbe9 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -25,8 +25,10 @@ using namespace llvm;
 // Include the auto-generated portion of the assembly writer.
 #include "MSP430GenAsmWriter.inc"
 
-void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot) {
   printInstruction(MI, O);
+  printAnnotation(O, Annot);
 }
 
 void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 50d98b7c41fde..a1984a8aec197 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -22,9 +22,9 @@ namespace llvm {
   class MSP430InstPrinter : public MCInstPrinter {
   public:
     MSP430InstPrinter(const MCAsmInfo &MAI)
-      : MCInstPrinter(MAI) {}
+        : MCInstPrinter(MAI) {}
 
-    virtual void printInst(const MCInst *MI, raw_ostream &O);
+    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
     // Autogenerated by tblgen.
     void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
index 0f3ebd3039241..04bd03e494604 100644
--- a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,11 @@ add_llvm_library(LLVMMSP430Desc
   MSP430MCTargetDesc.cpp
   MSP430MCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMMSP430Desc
+  LLVMMC
+  LLVMMSP430AsmPrinter
+  LLVMMSP430Info
+  )
+
+add_dependencies(LLVMMSP430Desc MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 43a704d7a7df3..fda70b81dc8c5 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -13,10 +13,12 @@
 
 #include "MSP430MCTargetDesc.h"
 #include "MSP430MCAsmInfo.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "MSP430GenInstrInfo.inc"
@@ -29,18 +31,18 @@
 
 using namespace llvm;
 
-
 static MCInstrInfo *createMSP430MCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitMSP430MCInstrInfo(X);
   return X;
 }
 
-extern "C" void LLVMInitializeMSP430MCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMSP430MCRegisterInfo(X, MSP430::PCW);
+  return X;
 }
 
-
 static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
                                                     StringRef FS) {
   MCSubtargetInfo *X = new MCSubtargetInfo();
@@ -48,11 +50,42 @@ static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeMSP430MCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
-                                          createMSP430MCSubtargetInfo);
+static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new MSP430InstPrinter(MAI);
+  return 0;
 }
 
-extern "C" void LLVMInitializeMSP430MCAsmInfo() {
+extern "C" void LLVMInitializeMSP430TargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheMSP430Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMSP430Target,
+                                        createMSP430MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMSP430Target,
+                                    createMSP430MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
+                                          createMSP430MCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
+                                        createMSP430MCInstPrinter);
 }
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index 0d8a6bdb44f9a..35f259076441e 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ALPHAMCTARGETDESC_H
-#define ALPHAMCTARGETDESC_H
+#ifndef MSP430MCTARGETDESC_H
+#define MSP430MCTARGETDESC_H
 
 namespace llvm {
 class MCSubtargetInfo;
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 2042056617ace..883654943b649 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -32,9 +32,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -163,17 +161,7 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   OutStreamer.EmitInstruction(TmpInst);
 }
 
-static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
-                                                unsigned SyntaxVariant,
-                                                const MCAsmInfo &MAI) {
-  if (SyntaxVariant == 0)
-    return new MSP430InstPrinter(MAI);
-  return 0;
-}
-
 // Force static initialization.
 extern "C" void LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
-  TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
-                                        createMSP430MCInstPrinter);
 }
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 0a3eab1f7a669..dc374315171fb 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -79,6 +79,7 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
 
   setStackPointerRegisterToSaveRestore(MSP430::SPW);
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
   setSchedulingPreference(Sched::Latency);
 
   // We have post-incremented loads / stores.
@@ -987,8 +988,8 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-bool MSP430TargetLowering::isTruncateFree(const Type *Ty1,
-                                          const Type *Ty2) const {
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+                                          Type *Ty2) const {
   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
     return false;
 
@@ -1002,7 +1003,7 @@ bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return (VT1.getSizeInBits() > VT2.getSizeInBits());
 }
 
-bool MSP430TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
   return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
 }
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index bd660a0bb0399..237f604357362 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -102,7 +102,7 @@ namespace llvm {
     /// isTruncateFree - Return true if it's free to truncate a value of type
     /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
     /// register R15W to i8 by referencing its sub-register R15B.
-    virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
     virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
 
     /// isZExtFree - Return true if any actual instruction that defines a value
@@ -113,7 +113,7 @@ namespace llvm {
     /// necessarily apply to truncate instructions. e.g. on msp430, all
     /// instructions that define 8-bit values implicit zero-extend the result
     /// out to 16 bits.
-    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
     virtual bool isZExtFree(EVT VT1, EVT VT2) const;
 
     MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 846d09361b33e..ffd43183c5d9d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -20,8 +20,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_CTOR
 #include "MSP430GenInstrInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 1cc60bba3a553..9049c4bf8f653 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
 // FIXME: Provide proper call frame setup / destroy opcodes.
 MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
                                        const TargetInstrInfo &tii)
-  : MSP430GenRegisterInfo(), TM(tm), TII(tii) {
+  : MSP430GenRegisterInfo(MSP430::PCW), TM(tm), TII(tii) {
   StackAlign = TM.getFrameLowering()->getStackAlignment();
 }
 
@@ -233,22 +233,8 @@ MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
   }
 }
 
-unsigned MSP430RegisterInfo::getRARegister() const {
-  return MSP430::PCW;
-}
-
 unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
   return TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW;
 }
-
-int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("Not implemented yet!");
-  return 0;
-}
-
-int MSP430RegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("Not implemented yet!");
-  return 0;
-}
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index fb70594ab37cb..10a3d5320636a 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -58,12 +58,7 @@ public:
   void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
-
-  //! Get DWARF debugging register number
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index b58c50afb9824..3ee14d9f7a83e 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -13,7 +13,7 @@
 
 #include "MSP430Subtarget.h"
 #include "MSP430.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 971f512141e80..4dd893326e3f8 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -16,7 +16,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 extern "C" void LLVMInitializeMSP430Target() {
@@ -25,10 +25,11 @@ extern "C" void LLVMInitializeMSP430Target() {
 }
 
 MSP430TargetMachine::MSP430TargetMachine(const Target &T,
-                                         const std::string &TT,
-                                         const std::string &CPU,
-                                         const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+                                         StringRef TT,
+                                         StringRef CPU,
+                                         StringRef FS,
+                                         Reloc::Model RM, CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS),
     // FIXME: Check TargetData string.
     DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 2a9eea0bcd82a..eb483dc8706f5 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -38,8 +38,9 @@ class MSP430TargetMachine : public LLVMTargetMachine {
   MSP430FrameLowering    FrameLowering;
 
 public:
-  MSP430TargetMachine(const Target &T, const std::string &TT,
-                      const std::string &CPU, const std::string &FS);
+  MSP430TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM);
 
   virtual const TargetFrameLowering *getFrameLowering() const {
     return &FrameLowering;
diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
index 2d1aa9d4e5e76..1526946af5fd6 100644
--- a/lib/Target/MSP430/TargetInfo/CMakeLists.txt
+++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMMSP430Info
   MSP430TargetInfo.cpp
   )
 
-add_dependencies(LLVMMSP430Info MSP430CodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMSP430Info
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMMSP430Info MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index f9ca5c49c9790..8b3e01ecf52c3 100644
--- a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "MSP430.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheMSP430Target;
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index 46c687b64001d..53ad155f376c2 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -159,7 +159,7 @@ static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
   unsigned ArgWords = 0;
   for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
        AI != AE; ++AI) {
-    const Type *Ty = AI->getType();
+    Type *Ty = AI->getType();
     // 'Dereference' type in case of byval parameter attribute
     if (AI->hasByValAttr())
       Ty = cast<PointerType>(Ty)->getElementType();
@@ -214,7 +214,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
     
       // fastcall and stdcall functions usually need @42 at the end to specify
       // the argument info.
-      const FunctionType *FT = F->getFunctionType();
+      FunctionType *FT = F->getFunctionType();
       if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
           // "Pure" variadic functions do not receive @0 suffix.
           (!FT->isVarArg() || FT->getNumParams() == 0 ||
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 36ab1a97e4f86..1b4329baf0f0a 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -1,17 +1,20 @@
 set(LLVM_TARGET_DEFINITIONS Mips.td)
 
-tablegen(MipsGenRegisterInfo.inc -gen-register-info)
-tablegen(MipsGenInstrInfo.inc -gen-instr-info)
-tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
-tablegen(MipsGenDAGISel.inc -gen-dag-isel)
-tablegen(MipsGenCallingConv.inc -gen-callingconv)
-tablegen(MipsGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MipsGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MipsGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MipsGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MipsGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MipsGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(MipsCommonTableGen)
 
 add_llvm_target(MipsCodeGen
   MipsAsmPrinter.cpp
+  MipsCodeEmitter.cpp
   MipsDelaySlotFiller.cpp
   MipsEmitGPRestore.cpp
   MipsExpandPseudo.cpp
+  MipsJITInfo.cpp
   MipsInstrInfo.cpp
   MipsISelDAGToDAG.cpp
   MipsISelLowering.cpp
@@ -25,6 +28,19 @@ add_llvm_target(MipsCodeGen
   MipsSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMMipsCodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMMipsAsmPrinter
+  LLVMMipsDesc
+  LLVMMipsInfo
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Mips/InstPrinter/CMakeLists.txt b/lib/Target/Mips/InstPrinter/CMakeLists.txt
index 8852fd4126e61..c45b35df8c115 100644
--- a/lib/Target/Mips/InstPrinter/CMakeLists.txt
+++ b/lib/Target/Mips/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMMipsAsmPrinter
   MipsInstPrinter.cpp
   )
-add_dependencies(LLVMMipsAsmPrinter MipsCodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMMipsAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
+
+add_dependencies(LLVMMipsAsmPrinter MipsCommonTableGen)
diff --git a/lib/Target/Mips/InstPrinter/Makefile b/lib/Target/Mips/InstPrinter/Makefile
index 63e38ef3e6aa3..74872a48b9741 100644
--- a/lib/Target/Mips/InstPrinter/Makefile
+++ b/lib/Target/Mips/InstPrinter/Makefile
@@ -10,7 +10,7 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMMipsAsmPrinter
 
-# Hack: we need to include 'main' arm target directory to grab private headers
+# Hack: we need to include 'main' mips target directory to grab private headers
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 41c1dd3919b49..3dafc6134488e 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax --------===//
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -69,8 +69,10 @@ void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   OS << '$' << LowercaseString(getRegisterName(RegNo));
 }
 
-void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot) {
   printInstruction(MI, O);
+  printAnnotation(O, Annot);
 }
 
 void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 680208eb819b3..5c1116538c610 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -1,4 +1,4 @@
-//===-- MipsInstPrinter.h - Convert Mips MCInst to assembly syntax ----------===//
+//===-- MipsInstPrinter.h - Convert Mips MCInst to assembly syntax --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -86,7 +86,7 @@ public:
   
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O);
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
   
 private:
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 97de75db5347a..2ceb5c95746bd 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,15 @@
 add_llvm_library(LLVMMipsDesc
-  MipsMCTargetDesc.cpp
+  MipsAsmBackend.cpp
   MipsMCAsmInfo.cpp
+  MipsMCCodeEmitter.cpp
+  MipsMCTargetDesc.cpp
+  )
+
+add_llvm_library_dependencies(LLVMMipsDesc
+  LLVMMC
+  LLVMMipsAsmPrinter
+  LLVMMipsInfo
+  LLVMSupport
   )
+
+add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 0000000000000..f190ec42c776c
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,117 @@
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  MipsELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine,
+                      bool HasRelocationAddend)
+    : MCELFObjectTargetWriter(is64Bit, OSType, EMachine,
+                              HasRelocationAddend) {}
+};
+
+class MipsAsmBackend : public MCAsmBackend {
+public:
+  MipsAsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  unsigned getNumFixupKinds() const {
+    return 1;   //tbd
+  }
+
+  /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
+  /// data fragment, at the offset specified by the fixup and following the
+  /// fixup kind as appropriate.
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+  }
+
+  /// @name Target Relaxation Interfaces
+  /// @{
+
+  /// MayNeedRelaxation - Check whether the given instruction may need
+  /// relaxation.
+  ///
+  /// \param Inst - The instruction to test.
+  bool MayNeedRelaxation(const MCInst &Inst) const {
+    return false;
+  }
+
+  /// RelaxInstruction - Relax the instruction in the given fragment to the next
+  /// wider instruction.
+  ///
+  /// \param Inst - The instruction to relax, which may be the same as the
+  /// output.
+  /// \parm Res [output] - On return, the relaxed instruction.
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  }
+  
+  /// @}
+
+  /// WriteNopData - Write an (optimal) nop sequence of Count bytes to the given
+  /// output. If the target cannot generate such a sequence, it should return an
+  /// error.
+  ///
+  /// \return - True on success.
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+    return false;
+  }
+};
+
+class MipsEB_AsmBackend : public MipsAsmBackend {
+public:
+  Triple::OSType OSType;
+
+  MipsEB_AsmBackend(const Target &T, Triple::OSType _OSType)
+    : MipsAsmBackend(T), OSType(_OSType) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(createELFObjectTargetWriter(),
+                                 OS, /*IsLittleEndian*/ false);
+  }
+
+  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+    return new MipsELFObjectWriter(false, OSType, ELF::EM_MIPS, false);
+  }
+};
+
+class MipsEL_AsmBackend : public MipsAsmBackend {
+public:
+  Triple::OSType OSType;
+
+  MipsEL_AsmBackend(const Target &T, Triple::OSType _OSType)
+    : MipsAsmBackend(T), OSType(_OSType) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(createELFObjectTargetWriter(),
+                                 OS, /*IsLittleEndian*/ true);
+  }
+
+  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+    return new MipsELFObjectWriter(false, OSType, ELF::EM_MIPS, false);
+  }
+};
+}
+
+MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  // just return little endian for now
+  //
+  return new MipsEL_AsmBackend(T, Triple(TT).getOS());
+}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 0000000000000..f7a6fa9490914
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,113 @@
+//===-- MipsBaseInfo.h - Top level definitions for ARM ------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPSBASEINFO_H
+#define MIPSBASEINFO_H
+
+#include "MipsMCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+/// getMipsRegisterNumbering - Given the enum value for some register,
+/// return the number that it corresponds to.
+inline static unsigned getMipsRegisterNumbering(unsigned RegEnum)
+{
+  switch (RegEnum) {
+  case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
+  case Mips::D0:
+    return 0;
+  case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
+    return 1;
+  case Mips::V0: case Mips::V0_64: case Mips::F2: case Mips::D2_64:
+  case Mips::D1:
+    return 2;
+  case Mips::V1: case Mips::V1_64: case Mips::F3: case Mips::D3_64:
+    return 3;
+  case Mips::A0: case Mips::A0_64: case Mips::F4: case Mips::D4_64:
+  case Mips::D2:
+    return 4;
+  case Mips::A1: case Mips::A1_64: case Mips::F5: case Mips::D5_64:
+    return 5;
+  case Mips::A2: case Mips::A2_64: case Mips::F6: case Mips::D6_64:
+  case Mips::D3:
+    return 6;
+  case Mips::A3: case Mips::A3_64: case Mips::F7: case Mips::D7_64:
+    return 7;
+  case Mips::T0: case Mips::T0_64: case Mips::F8: case Mips::D8_64:
+  case Mips::D4:
+    return 8;
+  case Mips::T1: case Mips::T1_64: case Mips::F9: case Mips::D9_64:
+    return 9;
+  case Mips::T2: case Mips::T2_64: case Mips::F10: case Mips::D10_64:
+  case Mips::D5:
+    return 10;
+  case Mips::T3: case Mips::T3_64: case Mips::F11: case Mips::D11_64:
+    return 11;
+  case Mips::T4: case Mips::T4_64: case Mips::F12: case Mips::D12_64:
+  case Mips::D6:
+    return 12;
+  case Mips::T5: case Mips::T5_64: case Mips::F13: case Mips::D13_64:
+    return 13;
+  case Mips::T6: case Mips::T6_64: case Mips::F14: case Mips::D14_64:
+  case Mips::D7:
+    return 14;
+  case Mips::T7: case Mips::T7_64: case Mips::F15: case Mips::D15_64:
+    return 15;
+  case Mips::S0: case Mips::S0_64: case Mips::F16: case Mips::D16_64:
+  case Mips::D8:
+    return 16;
+  case Mips::S1: case Mips::S1_64: case Mips::F17: case Mips::D17_64:
+    return 17;
+  case Mips::S2: case Mips::S2_64: case Mips::F18: case Mips::D18_64:
+  case Mips::D9:
+    return 18;
+  case Mips::S3: case Mips::S3_64: case Mips::F19: case Mips::D19_64:
+    return 19;
+  case Mips::S4: case Mips::S4_64: case Mips::F20: case Mips::D20_64:
+  case Mips::D10:
+    return 20;
+  case Mips::S5: case Mips::S5_64: case Mips::F21: case Mips::D21_64:
+    return 21;
+  case Mips::S6: case Mips::S6_64: case Mips::F22: case Mips::D22_64:
+  case Mips::D11:
+    return 22;
+  case Mips::S7: case Mips::S7_64: case Mips::F23: case Mips::D23_64:
+    return 23;
+  case Mips::T8: case Mips::T8_64: case Mips::F24: case Mips::D24_64:
+  case Mips::D12:
+    return 24;
+  case Mips::T9: case Mips::T9_64: case Mips::F25: case Mips::D25_64:
+    return 25;
+  case Mips::K0: case Mips::K0_64: case Mips::F26: case Mips::D26_64:
+  case Mips::D13:
+    return 26;
+  case Mips::K1: case Mips::K1_64: case Mips::F27: case Mips::D27_64:
+    return 27;
+  case Mips::GP: case Mips::GP_64: case Mips::F28: case Mips::D28_64:
+  case Mips::D14:
+    return 28;
+  case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64:
+    return 29;
+  case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64:
+  case Mips::D15: 
+    return 30;
+  case Mips::RA: case Mips::RA_64: case Mips::F31: case Mips::D31_64:
+    return 31;
+  default: llvm_unreachable("Unknown register number!");
+  }
+  return 0; // Not reached
+}
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 0000000000000..8b099eab91fd7
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,90 @@
+#ifndef LLVM_Mips_MipsFIXUPKINDS_H
+#define LLVM_Mips_MipsFIXUPKINDS_H
+
+//===-- Mips/MipsFixupKinds.h - Mips Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+    enum Fixups {
+        // fixup_Mips_xxx - R_MIPS_NONE
+        fixup_Mips_NONE = FirstTargetFixupKind,
+
+        // fixup_Mips_xxx - R_MIPS_16.
+        fixup_Mips_16,
+
+        // fixup_Mips_xxx - R_MIPS_32.
+        fixup_Mips_32,
+
+        // fixup_Mips_xxx - R_MIPS_REL32.
+        fixup_Mips_REL32,
+
+        // fixup_Mips_xxx - R_MIPS_26.
+        fixup_Mips_26,
+
+        // fixup_Mips_xxx - R_MIPS_HI16.
+        fixup_Mips_HI16,
+
+        // fixup_Mips_xxx - R_MIPS_LO16.
+        fixup_Mips_LO16,
+
+        // fixup_Mips_xxx - R_MIPS_GPREL16.
+        fixup_Mips_GPREL16,
+
+        // fixup_Mips_xxx - R_MIPS_LITERAL.
+        fixup_Mips_LITERAL,
+
+        // fixup_Mips_xxx - R_MIPS_GOT16.
+        fixup_Mips_GOT16,
+
+        // fixup_Mips_xxx - R_MIPS_PC16.
+        fixup_Mips_PC16,
+
+        // fixup_Mips_xxx - R_MIPS_CALL16.
+        fixup_Mips_CALL16,
+
+        // fixup_Mips_xxx - R_MIPS_GPREL32.
+        fixup_Mips_GPREL32,
+
+        // fixup_Mips_xxx - R_MIPS_SHIFT5.
+        fixup_Mips_SHIFT5,
+
+        // fixup_Mips_xxx - R_MIPS_SHIFT6.
+        fixup_Mips_SHIFT6,
+
+        // fixup_Mips_xxx - R_MIPS_64.
+        fixup_Mips_64,
+
+        // fixup_Mips_xxx - R_MIPS_TLS_GD.
+        fixup_Mips_TLSGD,
+
+        // fixup_Mips_xxx - R_MIPS_TLS_GOTTPREL.
+        fixup_Mips_GOTTPREL,
+
+        // fixup_Mips_xxx - R_MIPS_TLS_TPREL_HI16.
+        fixup_Mips_TPREL_HI,
+
+        // fixup_Mips_xxx - R_MIPS_TLS_TPREL_LO16.
+        fixup_Mips_TPREL_LO,
+
+        // fixup_Mips_xxx - yyy. // This should become R_MIPS_PC16
+        fixup_Mips_Branch_PCRel,
+
+        // Marker
+        LastTargetFixupKind,
+        NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+    };
+} // namespace llvm
+} // namespace Mips
+
+
+#endif /* LLVM_Mips_MipsFIXUPKINDS_H */
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 5d9242500f6d5..71ae804989958 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
 
 MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::mips)
+  if ((TheTriple.getArch() == Triple::mips) ||
+      (TheTriple.getArch() == Triple::mips64))
     IsLittleEndian = false;
 
   AlignmentIsInBytes          = false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 0000000000000..d66de23ba115c
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,52 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+#define DEBUG_TYPE "mccodeemitter"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+
+using namespace llvm;
+
+namespace {
+class MipsMCCodeEmitter : public MCCodeEmitter {
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+  const MCInstrInfo &MCII;
+  const MCSubtargetInfo &STI;
+
+public:
+  MipsMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                    MCContext &ctx)
+    : MCII(mcii), STI(sti) {}
+
+  ~MipsMCCodeEmitter() {}
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  }
+}; // class MipsMCCodeEmitter
+}  // namespace
+
+MCCodeEmitter *llvm::createMipsMCCodeEmitter(const MCInstrInfo &MCII,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new MipsMCCodeEmitter(MCII, STI, Ctx);
+}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 06f0d0bfb6b9b..1f9e3ddf13c86 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -13,10 +13,14 @@
 
 #include "MipsMCTargetDesc.h"
 #include "MipsMCAsmInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "MipsGenInstrInfo.inc"
@@ -35,11 +39,12 @@ static MCInstrInfo *createMipsMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeMipsMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMipsMCRegisterInfo(X, Mips::RA);
+  return X;
 }
 
-
 static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
                                                   StringRef FS) {
   MCSubtargetInfo *X = new MCSubtargetInfo();
@@ -47,12 +52,111 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeMipsMCSubtargetInfo() {
+static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new MipsMCAsmInfo(T, TT);
+
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(Mips::SP, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                              CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default)
+    RM = Reloc::PIC_;
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCSubtargetInfo &STI) {
+  return new MipsInstPrinter(MAI);
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheMipselTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn A(TheMips64Target, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn B(TheMips64elTarget, createMipsMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipsTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipselTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64Target,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64elTarget,
+                                        createMipsMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMipselTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64Target, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64elTarget, createMipsMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMipsTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMipselTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64Target, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64elTarget,
+                                    createMipsMCRegisterInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheMipsTarget, createMipsMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheMipselTarget,
+                                        createMipsMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64Target,
+                                        createMipsMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64elTarget,
+                                        createMipsMCCodeEmitter);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheMipsTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMipselTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget, createMCStreamer);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, createMipsAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheMipselTarget, createMipsAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64Target, createMipsAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget, createMipsAsmBackend);
+
+  // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget,
                                           createMipsMCSubtargetInfo);
-}
+  TargetRegistry::RegisterMCSubtargetInfo(TheMipselTarget,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64Target,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64elTarget,
+                                          createMipsMCSubtargetInfo);
 
-extern "C" void LLVMInitializeMipsMCAsmInfo() {
-  RegisterMCAsmInfo<MipsMCAsmInfo> X(TheMipsTarget);
-  RegisterMCAsmInfo<MipsMCAsmInfo> Y(TheMipselTarget);
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMipsTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64Target,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64elTarget,
+                                        createMipsMCInstPrinter);
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 3d18f114c8bd0..7a0042ad889ea 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- AlphaMCTargetDesc.h - Alpha Target Descriptions ---------*- C++ -*-===//
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,21 +7,32 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides Alpha specific target descriptions.
+// This file provides Mips specific target descriptions.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ALPHAMCTARGETDESC_H
-#define ALPHAMCTARGETDESC_H
+#ifndef MIPSMCTARGETDESC_H
+#define MIPSMCTARGETDESC_H
 
 namespace llvm {
+class MCAsmBackend;
+class MCInstrInfo;
+class MCCodeEmitter;
+class MCContext;
 class MCSubtargetInfo;
-class Target;
 class StringRef;
+class Target;
 
 extern Target TheMipsTarget;
 extern Target TheMipselTarget;
+extern Target TheMips64Target;
+extern Target TheMips64elTarget;
+
+MCCodeEmitter *createMipsMCCodeEmitter(const MCInstrInfo &MCII,
+                                       const MCSubtargetInfo &STI,
+                                       MCContext &Ctx);
 
+MCAsmBackend *createMipsAsmBackend(const Target &T, StringRef TT);
 } // End llvm namespace
 
 // Defines symbolic names for Mips registers.  This defines a mapping from
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 984b5adfc5f34..bacecf20b9200 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -29,6 +29,9 @@ namespace llvm {
   FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM);
   FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM);
 
+  FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+                                             JITCodeEmitter &JCE);
+
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 433cd57f34e08..39c2c164f664c 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -38,6 +38,10 @@ def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
                                 "true", "Only supports single precision float">;
 def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
                                 "Enable o32 ABI">;
+def FeatureN32         : SubtargetFeature<"n32", "MipsABI", "N32",
+                                "Enable n32 ABI">;
+def FeatureN64         : SubtargetFeature<"n64", "MipsABI", "N64",
+                                "Enable n64 ABI">;
 def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
                                 "Enable eabi ABI">;
 def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
@@ -54,16 +58,19 @@ def FeatureSwap        : SubtargetFeature<"swap", "HasSwap", "true",
                                 "Enable 'byte/half swap' instructions.">;
 def FeatureBitCount    : SubtargetFeature<"bitcount", "HasBitCount", "true",
                                 "Enable 'count leading bits' instructions.">;
-def FeatureMips1       : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
-                                "Mips1 ISA Support">;
-def FeatureMips2       : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
-                                "Mips2 ISA Support">;
 def FeatureMips32      : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
                                 "Mips32 ISA Support",
                                 [FeatureCondMov, FeatureBitCount]>;
 def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
                                 "Mips32r2", "Mips32r2 ISA Support",
                                 [FeatureMips32, FeatureSEInReg]>;
+def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
+                                "Mips64", "Mips64 ISA Support",
+                                [FeatureGP64Bit, FeatureFP64Bit,
+                                 FeatureMips32]>;
+def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
+                                "Mips64r2", "Mips64r2 ISA Support",
+                                [FeatureMips64, FeatureMips32r2]>;
 
 //===----------------------------------------------------------------------===//
 // Mips processors supported.
@@ -72,21 +79,10 @@ def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, MipsGenericItineraries, Features>;
 
-def : Proc<"mips1", [FeatureMips1]>;
-def : Proc<"r2000", [FeatureMips1]>;
-def : Proc<"r3000", [FeatureMips1]>;
-
-def : Proc<"mips2", [FeatureMips2]>;
-def : Proc<"r6000", [FeatureMips2]>;
-
+def : Proc<"mips32r1", [FeatureMips32]>;
 def : Proc<"4ke", [FeatureMips32r2]>;
-
-// Allegrex is a 32bit subset of r4000, both for integer and fp registers,
-// but much more similar to Mips2 than Mips3. It also contains some of
-// Mips32/Mips32r2 instructions and a custom vector fpu processor.
-def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI,
-      FeatureVFPU, FeatureSEInReg, FeatureCondMov, FeatureMulDivAdd,
-      FeatureMinMax, FeatureSwap, FeatureBitCount]>;
+def : Proc<"mips64r1", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
 
 def MipsAsmWriter : AsmWriter {
   string AsmWriterClassName  = "InstPrinter";
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 0000000000000..49b0223643bbb
--- /dev/null
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,214 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def shamt_64       : Operand<i64>;
+
+// Unsigned Operand
+def uimm16_64      : Operand<i64> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// Transformation Function - get Imm - 32.
+def Subtract32 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getZExtValue() - 32);
+}]>;
+
+// imm32_63 predicate - True if imm is in range [32, 63].
+def imm32_63 : ImmLeaf<i64,
+                       [{return (int32_t)Imm >= 32 && (int32_t)Imm < 64;}],
+                       Subtract32>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+// Shifts
+class LogicR_shift_rotate_imm64<bits<6> func, bits<5> _rs, string instr_asm,
+                                SDNode OpNode, PatFrag PF>:
+  FR<0x00, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$b, shamt_64:$c),
+     !strconcat(instr_asm, "\t$dst, $b, $c"),
+     [(set CPU64Regs:$dst, (OpNode CPU64Regs:$b, (i64 PF:$c)))],
+     IIAlu> {
+  let rs = _rs;
+}
+
+class LogicR_shift_rotate_reg64<bits<6> func, bits<5> _shamt, string instr_asm,
+                                SDNode OpNode>:
+  FR<0x00, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$c, CPU64Regs:$b),
+     !strconcat(instr_asm, "\t$dst, $b, $c"),
+     [(set CPU64Regs:$dst, (OpNode CPU64Regs:$b, CPU64Regs:$c))], IIAlu> {
+  let shamt = _shamt;
+}
+
+// Mul, Div
+let Defs = [HI64, LO64] in {
+  let isCommutable = 1 in
+  class Mul64<bits<6> func, string instr_asm, InstrItinClass itin>:
+    FR<0x00, func, (outs), (ins CPU64Regs:$a, CPU64Regs:$b),
+       !strconcat(instr_asm, "\t$a, $b"), [], itin>;
+
+  class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
+              FR<0x00, func, (outs), (ins CPU64Regs:$a, CPU64Regs:$b),
+              !strconcat(instr_asm, "\t$$zero, $a, $b"),
+              [(op CPU64Regs:$a, CPU64Regs:$b)], itin>;
+}
+
+// Move from Hi/Lo
+let shamt = 0 in {
+let rs = 0, rt = 0 in
+class MoveFromLOHI64<bits<6> func, string instr_asm>:
+  FR<0x00, func, (outs CPU64Regs:$dst), (ins),
+     !strconcat(instr_asm, "\t$dst"), [], IIHiLo>;
+
+let rt = 0, rd = 0 in
+class MoveToLOHI64<bits<6> func, string instr_asm>:
+  FR<0x00, func, (outs), (ins CPU64Regs:$src),
+     !strconcat(instr_asm, "\t$src"), [], IIHiLo>;
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading64<bits<6> func, string instr_asm, list<dag> pattern>:
+  FR<0x1c, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$src),
+     !strconcat(instr_asm, "\t$dst, $src"), pattern, IIAlu>,
+     Requires<[HasBitCount]> {
+  let shamt = 0;
+  let rt = rd;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def DADDiu   : ArithLogicI<0x19, "daddiu", add, simm16_64, immSExt16,
+                           CPU64Regs>;
+def DANDi    : ArithLogicI<0x0c, "andi", and, uimm16_64, immZExt16, CPU64Regs>;
+def SLTi64   : SetCC_I<0x0a, "slti", setlt, simm16_64, immSExt16, CPU64Regs>;
+def SLTiu64  : SetCC_I<0x0b, "sltiu", setult, simm16_64, immSExt16, CPU64Regs>;
+def ORi64    : ArithLogicI<0x0d, "ori", or, uimm16_64, immZExt16, CPU64Regs>;
+def XORi64   : ArithLogicI<0x0e, "xori", xor, uimm16_64, immZExt16, CPU64Regs>;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def DADDu    : ArithLogicR<0x00, 0x2d, "daddu", add, IIAlu, CPU64Regs, 1>;
+def DSUBu    : ArithLogicR<0x00, 0x2f, "dsubu", sub, IIAlu, CPU64Regs>;
+def SLT64    : SetCC_R<0x00, 0x2a, "slt", setlt, CPU64Regs>;
+def SLTu64   : SetCC_R<0x00, 0x2b, "sltu", setult, CPU64Regs>;
+def AND64    : ArithLogicR<0x00, 0x24, "and", and, IIAlu, CPU64Regs, 1>;
+def OR64     : ArithLogicR<0x00, 0x25, "or", or, IIAlu, CPU64Regs, 1>;
+def XOR64    : ArithLogicR<0x00, 0x26, "xor", xor, IIAlu, CPU64Regs, 1>;
+def NOR64    : LogicNOR<0x00, 0x27, "nor", CPU64Regs>;
+
+/// Shift Instructions
+def DSLL     : LogicR_shift_rotate_imm64<0x38, 0x00, "dsll", shl, immZExt5>;
+def DSRL     : LogicR_shift_rotate_imm64<0x3a, 0x00, "dsrl", srl, immZExt5>;
+def DSRA     : LogicR_shift_rotate_imm64<0x3b, 0x00, "dsra", sra, immZExt5>;
+def DSLL32   : LogicR_shift_rotate_imm64<0x3c, 0x00, "dsll32", shl, imm32_63>;
+def DSRL32   : LogicR_shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl, imm32_63>;
+def DSRA32   : LogicR_shift_rotate_imm64<0x3f, 0x00, "dsra32", sra, imm32_63>;
+def DSLLV    : LogicR_shift_rotate_reg64<0x24, 0x00, "dsllv", shl>;
+def DSRLV    : LogicR_shift_rotate_reg64<0x26, 0x00, "dsrlv", srl>;
+def DSRAV    : LogicR_shift_rotate_reg64<0x27, 0x00, "dsrav", sra>;
+
+// Rotate Instructions
+let Predicates = [HasMips64r2] in {
+  def DROTR    : LogicR_shift_rotate_imm64<0x3a, 0x01, "drotr", rotr, immZExt5>;
+  def DROTR32  : LogicR_shift_rotate_imm64<0x3e, 0x01, "drotr32", rotr,
+                                           imm32_63>;
+  def DROTRV   : LogicR_shift_rotate_reg64<0x16, 0x01, "drotrv", rotr>;
+}
+
+/// Load and Store Instructions
+///  aligned 
+defm LB64    : LoadM64<0x20, "lb",  sextloadi8>;
+defm LBu64   : LoadM64<0x24, "lbu", zextloadi8>;
+defm LH64    : LoadM64<0x21, "lh",  sextloadi16_a>;
+defm LHu64   : LoadM64<0x25, "lhu", zextloadi16_a>;
+defm LW64    : LoadM64<0x23, "lw",  sextloadi32_a>;
+defm LWu64   : LoadM64<0x27, "lwu", zextloadi32_a>;
+defm SB64    : StoreM64<0x28, "sb", truncstorei8>;
+defm SH64    : StoreM64<0x29, "sh", truncstorei16_a>;
+defm SW64    : StoreM64<0x2b, "sw", truncstorei32_a>;
+defm LD      : LoadM64<0x37, "ld",  load_a>;
+defm SD      : StoreM64<0x3f, "sd", store_a>;
+
+///  unaligned
+defm ULH64     : LoadM64<0x21, "ulh",  sextloadi16_u, 1>;
+defm ULHu64    : LoadM64<0x25, "ulhu", zextloadi16_u, 1>;
+defm ULW64     : LoadM64<0x23, "ulw",  sextloadi32_u, 1>;
+defm USH64     : StoreM64<0x29, "ush", truncstorei16_u, 1>;
+defm USW64     : StoreM64<0x2b, "usw", truncstorei32_u, 1>;
+defm ULD       : LoadM64<0x37, "uld",  load_u, 1>;
+defm USD       : StoreM64<0x3f, "usd", store_u, 1>;
+
+/// Jump and Branch Instructions
+def BEQ64  : CBranch<0x04, "beq", seteq, CPU64Regs>;
+def BNE64  : CBranch<0x05, "bne", setne, CPU64Regs>;
+def BGEZ64 : CBranchZero<0x01, 1, "bgez", setge, CPU64Regs>;
+def BGTZ64 : CBranchZero<0x07, 0, "bgtz", setgt, CPU64Regs>;
+def BLEZ64 : CBranchZero<0x07, 0, "blez", setle, CPU64Regs>;
+def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>;
+
+/// Multiply and Divide Instructions.
+def DMULT    : Mul64<0x1c, "dmult", IIImul>;
+def DMULTu   : Mul64<0x1d, "dmultu", IIImul>;
+def DSDIV    : Div64<MipsDivRem, 0x1e, "ddiv", IIIdiv>;
+def DUDIV    : Div64<MipsDivRemU, 0x1f, "ddivu", IIIdiv>;
+
+let Defs = [HI64] in
+  def MTHI64  : MoveToLOHI64<0x11, "mthi">;
+let Defs = [LO64] in
+  def MTLO64  : MoveToLOHI64<0x13, "mtlo">;
+
+let Uses = [HI64] in
+  def MFHI64  : MoveFromLOHI64<0x10, "mfhi">;
+let Uses = [LO64] in
+  def MFLO64  : MoveFromLOHI64<0x12, "mflo">;
+
+/// Count Leading
+def DCLZ : CountLeading64<0x24, "dclz",
+                          [(set CPU64Regs:$dst, (ctlz CPU64Regs:$src))]>;
+def DCLO : CountLeading64<0x25, "dclo",
+                          [(set CPU64Regs:$dst, (ctlz (not CPU64Regs:$src)))]>;
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i64 immSExt16:$in),
+          (DADDiu ZERO_64, imm:$in)>;
+def : Pat<(i64 immZExt16:$in),
+          (ORi64 ZERO_64, imm:$in)>;
+
+// zextloadi32_u
+def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>,
+      Requires<[IsN64]>;
+def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>,
+      Requires<[NotN64]>;
+
+// hi/lo relocs
+def : Pat<(i64 (MipsLo tglobaladdr:$in)), (DADDiu ZERO_64, tglobaladdr:$in)>;
+
+defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+                  ZERO_64>;
+
+// setcc patterns
+defm : SeteqPats<CPU64Regs, SLTiu64, XOR64, SLTu64, ZERO_64>;
+defm : SetlePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgtPats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 69e03bd297241..0e826812d0766 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -18,6 +18,7 @@
 #include "MipsInstrInfo.h"
 #include "MipsMachineFunction.h"
 #include "MipsMCInstLower.h"
+#include "MipsMCSymbolRefExpr.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/Instructions.h"
@@ -25,6 +26,7 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInst.h"
@@ -33,15 +35,22 @@
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Analysis/DebugInfo.h"
 
 using namespace llvm;
 
+static bool isUnalignedLoadStore(unsigned Opc) {
+  return Opc == Mips::ULW    || Opc == Mips::ULH    || Opc == Mips::ULHu ||
+         Opc == Mips::USW    || Opc == Mips::USH    ||
+         Opc == Mips::ULW_P8 || Opc == Mips::ULH_P8 || Opc == Mips::ULHu_P8 ||
+         Opc == Mips::USW_P8 || Opc == Mips::USH_P8;
+}
+
 void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
@@ -52,8 +61,21 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
 
   MipsMCInstLower MCInstLowering(Mang, *MF, *this);
+  unsigned Opc = MI->getOpcode();
   MCInst TmpInst0;
   MCInstLowering.Lower(MI, TmpInst0);
+  
+  // Enclose unaligned load or store with .macro & .nomacro directives.
+  if (isUnalignedLoadStore(Opc)) {
+    MCInst Directive;
+    Directive.setOpcode(Mips::MACRO);
+    OutStreamer.EmitInstruction(Directive);
+    OutStreamer.EmitInstruction(TmpInst0);
+    Directive.setOpcode(Mips::NOMACRO);
+    OutStreamer.EmitInstruction(Directive);
+    return;
+  }
+
   OutStreamer.EmitInstruction(TmpInst0);
 }
 
@@ -180,7 +202,6 @@ void MipsAsmPrinter::emitFrameDirective() {
 const char *MipsAsmPrinter::getCurrentABIString() const {
   switch (Subtarget->getTargetABI()) {
   case MipsSubtarget::O32:  return "abi32";
-  case MipsSubtarget::O64:  return "abiO64";
   case MipsSubtarget::N32:  return "abiN32";
   case MipsSubtarget::N64:  return "abi64";
   case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
@@ -304,6 +325,11 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
   case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
   case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+  case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+  case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+  case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+  case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+  case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
   }
 
   switch (MO.getType()) {
@@ -424,17 +450,9 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 }
 
 // Force static initialization.
-static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
-                                              unsigned SyntaxVariant,
-                                              const MCAsmInfo &MAI) {
-  return new MipsInstPrinter(MAI);
-}
-
 extern "C" void LLVMInitializeMipsAsmPrinter() {
   RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
   RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
-
-  TargetRegistry::RegisterMCInstPrinter(TheMipsTarget, createMipsMCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
-                                        createMipsMCInstPrinter);
+  RegisterAsmPrinter<MipsAsmPrinter> A(TheMips64Target);
+  RegisterAsmPrinter<MipsAsmPrinter> B(TheMips64elTarget);
 }
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 876f0fcc83ea2..0ae4ef6fbad47 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -30,6 +30,55 @@ def RetCC_MipsO32 : CallingConv<[
   CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
 ]>;
 
+//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN : CallingConv<[
+  // FIXME: Handle byval, complex and float double parameters.
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64],
+                                          [D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64]>>,
+
+  // f32 arguments are passed in single precision FP registers.
+  CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+                                           F16, F17, F18, F19],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // f64 arguments are passed in double precision FP registers.
+  CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>,
+  CCIfType<[f32], CCAssignToStack<4, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+  // FIXME: Handle complex and float double return values.
+
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // i64 are returned in registers V0_64, V1_64
+  CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in registers D0, D2
+  CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // Mips EABI Calling Convention
 //===----------------------------------------------------------------------===//
@@ -77,10 +126,14 @@ def RetCC_MipsEABI : CallingConv<[
 //===----------------------------------------------------------------------===//
 
 def CC_Mips : CallingConv<[
-  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
+  CCIfSubtarget<"isABI_N32()", CCDelegateTo<CC_MipsN>>,
+  CCIfSubtarget<"isABI_N64()", CCDelegateTo<CC_MipsN>>
 ]>;
 
 def RetCC_Mips : CallingConv<[
   CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+  CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+  CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
   CCDelegateTo<RetCC_MipsO32>
 ]>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
new file mode 100644
index 0000000000000..9220d9c7ab4f2
--- /dev/null
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- Mips/MipsCodeEmitter.cpp - Convert Mips code to machine code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Mips machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class MipsCodeEmitter : public MachineFunctionPass {
+  MipsJITInfo *JTI;
+  const MipsInstrInfo *II;
+  const TargetData *TD;
+  const MipsSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  const std::vector<MachineJumpTableEntry> *MJTEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+  public:
+    MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) :
+      MachineFunctionPass(ID), JTI(0),
+        II((const MipsInstrInfo *) tm.getInstrInfo()),
+        TD(tm.getTargetData()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_) {
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "Mips Machine Code Emitter";
+    }
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    unsigned getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+
+    void emitWordLE(unsigned Word);
+
+    /// Routines that handle operands which add machine relocations which are
+    /// fixed up by the relocation stage.
+    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                bool MayNeedFarStub) const;
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+    void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+
+    /// getMachineOpValue - Return binary encoding of operand. If the machine
+    /// operand requires relocation, record the relocation and return zero.
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+
+    unsigned getRelocation(const MachineInstr &MI,
+                           const MachineOperand &MO) const;
+
+  };
+}
+
+char MipsCodeEmitter::ID = 0;
+
+bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  JTI = ((MipsTargetMachine&) MF.getTarget()).getJITInfo();
+  II = ((const MipsTargetMachine&) MF.getTarget()).getInstrInfo();
+  TD = ((const MipsTargetMachine&) MF.getTarget()).getTargetData();
+  Subtarget = &TM.getSubtarget<MipsSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = 0;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  JTI->Initialize(MF, IsPIC);
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getFunction()->getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+          I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI,
+                                        const MachineOperand &MO) const {
+  // NOTE: This relocations are for static.
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  uint64_t Form = TSFlags & MipsII::FormMask;
+  if (Form == MipsII::FrmJ)
+    return Mips::reloc_mips_26;
+  if ((Form == MipsII::FrmI || Form == MipsII::FrmFI)
+       && MI.getDesc().isBranch())
+    return Mips::reloc_mips_branch;
+  if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi)
+    return Mips::reloc_mips_hi;
+  return Mips::reloc_mips_lo;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) const {
+  if (MO.isReg())
+    return MipsRegisterInfo::getRegisterNumbering(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+
+void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                                bool MayNeedFarStub) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                             const_cast<GlobalValue *>(GV), 0, MayNeedFarStub));
+}
+
+void MipsCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0, false));
+}
+
+void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void MipsCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, false));
+}
+
+void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                           unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+void MipsCodeEmitter::emitInstruction(const MachineInstr &MI) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  // Skip pseudo instructions.
+  if ((MI.getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo)
+    return;
+
+  ++NumEmitted;  // Keep track of the # of mi's emitted
+
+  switch (MI.getOpcode()) {
+  default:
+    emitWordLE(getBinaryCodeForInstr(MI));
+    break;
+  }
+
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+void MipsCodeEmitter::emitWordLE(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  MCE.emitWordLE(Word);
+}
+
+/// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips
+/// code to the specified MCE object.
+FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+    JITCodeEmitter &JCE) {
+  return new MipsCodeEmitter(TM, JCE);
+}
+
+unsigned MipsCodeEmitter::getBinaryCodeForInstr(const MachineInstr &MI) const {
+ // this function will be automatically generated by the CodeEmitterGenerator
+ // using TableGen
+ return 0;
+}
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index c3a6211399cd9..be3b7a02ec312 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Simple pass to fills delay slots with NOPs.
+// Simple pass to fills delay slots with useful instructions.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,18 +17,31 @@
 #include "MipsTargetMachine.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 
 using namespace llvm;
 
 STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+                       " are not NOP.");
+
+static cl::opt<bool> EnableDelaySlotFiller(
+  "enable-mips-delay-filler",
+  cl::init(false),
+  cl::desc("Fill the Mips delay slots useful instructions."),
+  cl::Hidden);
 
 namespace {
   struct Filler : public MachineFunctionPass {
 
     TargetMachine &TM;
     const TargetInstrInfo *TII;
+    MachineBasicBlock::iterator LastFiller;
 
     static char ID;
     Filler(TargetMachine &tm)
@@ -47,31 +60,61 @@ namespace {
       return Changed;
     }
 
+    bool isDelayFiller(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator candidate);
+
+    void insertCallUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    void insertDefsUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+                    unsigned Reg);
+
+    bool delayHasHazard(MachineBasicBlock::iterator candidate,
+                        bool &sawLoad, bool &sawStore,
+                        SmallSet<unsigned, 32> &RegDefs,
+                        SmallSet<unsigned, 32> &RegUses);
+
+    bool
+    findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot,
+                   MachineBasicBlock::iterator &Filler);
+
+
   };
   char Filler::ID = 0;
 } // end of anonymous namespace
 
 /// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
-/// Currently, we fill delay slots with NOPs. We assume there is only one
-/// delay slot per delayed instruction.
+/// We assume there is only one delay slot per delayed instruction.
 bool Filler::
-runOnMachineBasicBlock(MachineBasicBlock &MBB)
-{
+runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
-  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
-    const MCInstrDesc& MCid = I->getDesc();
-    if (MCid.hasDelaySlot() &&
-        (TM.getSubtarget<MipsSubtarget>().isMips1() ||
-         MCid.isCall() || MCid.isBranch() || MCid.isReturn())) {
-      MachineBasicBlock::iterator J = I;
-      ++J;
-      BuildMI(MBB, J, I->getDebugLoc(), TII->get(Mips::NOP));
+  LastFiller = MBB.end();
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+    if (I->getDesc().hasDelaySlot()) {
       ++FilledSlots;
       Changed = true;
-    }
-  }
 
+      MachineBasicBlock::iterator D;
+
+      if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
+        MBB.splice(llvm::next(I), &MBB, D);
+        ++UsefulSlots;
+      }
+      else 
+        BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+
+      // Record the filler instruction that filled the delay slot.
+      // The instruction after it will be visited in the next iteration.
+      LastFiller = ++I;
+     }
   return Changed;
+
 }
 
 /// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
@@ -80,3 +123,134 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
   return new Filler(tm);
 }
 
+bool Filler::findDelayInstr(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator slot,
+                            MachineBasicBlock::iterator &Filler) {
+  SmallSet<unsigned, 32> RegDefs;
+  SmallSet<unsigned, 32> RegUses;
+
+  insertDefsUses(slot, RegDefs, RegUses);
+
+  bool sawLoad = false;
+  bool sawStore = false;
+
+  for (MachineBasicBlock::reverse_iterator I(slot); I != MBB.rend(); ++I) {
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    // Convert to forward iterator.
+    MachineBasicBlock::iterator FI(llvm::next(I).base());
+
+    if (I->hasUnmodeledSideEffects()
+        || I->isInlineAsm()
+        || I->isLabel()
+        || FI == LastFiller
+        || I->getDesc().isPseudo()
+        //
+        // Should not allow:
+        // ERET, DERET or WAIT, PAUSE. Need to add these to instruction
+        // list. TBD.
+        )
+      break;
+
+    if (delayHasHazard(FI, sawLoad, sawStore, RegDefs, RegUses)) {
+      insertDefsUses(FI, RegDefs, RegUses);
+      continue;
+    }
+
+    Filler = FI;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+                            bool &sawLoad,
+                            bool &sawStore,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses) {
+  if (candidate->isImplicitDef() || candidate->isKill())
+    return true;
+
+  MCInstrDesc MCID = candidate->getDesc();
+  // Loads or stores cannot be moved past a store to the delay slot
+  // and stores cannot be moved past a load. 
+  if (MCID.mayLoad()) {
+    if (sawStore)
+      return true;
+    sawLoad = true;
+  }
+
+  if (MCID.mayStore()) {
+    if (sawStore)
+      return true;
+    sawStore = true;
+    if (sawLoad)
+      return true;
+  }
+
+  assert((!MCID.isCall() && !MCID.isReturn()) &&
+         "Cannot put calls or returns in delay slot.");
+
+  for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) {
+    const MachineOperand &MO = candidate->getOperand(i);
+    unsigned Reg;
+
+    if (!MO.isReg() || !(Reg = MO.getReg()))
+      continue; // skip
+
+    if (MO.isDef()) {
+      // check whether Reg is defined or used before delay slot.
+      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
+        return true;
+    }
+    if (MO.isUse()) {
+      // check whether Reg is defined before delay slot.
+      if (IsRegInSet(RegDefs, Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses) {
+  // If MI is a call or return, just examine the explicit non-variadic operands.
+  MCInstrDesc MCID = MI->getDesc();
+  unsigned e = MCID.isCall() || MCID.isReturn() ? MCID.getNumOperands() :
+                                                  MI->getNumOperands();
+  
+  // Add RA to RegDefs to prevent users of RA from going into delay slot. 
+  if (MCID.isCall())
+    RegDefs.insert(Mips::RA);
+
+  for (unsigned i = 0; i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    unsigned Reg;
+
+    if (!MO.isReg() || !(Reg = MO.getReg()))
+      continue;
+
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    else if (MO.isUse())
+      RegUses.insert(Reg);
+  }
+}
+
+//returns true if the Reg or its alias is in the RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) {
+  if (RegSet.count(Reg))
+    return true;
+  // check Aliased Registers
+  for (const unsigned *Alias = TM.getRegisterInfo()->getAliasSet(Reg);
+       *Alias; ++Alias)
+    if (RegSet.count(*Alias))
+      return true;
+
+  return false;
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index a0f90a0d487ae..22d1e47b1a2b6 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -254,9 +254,15 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   }
 
   // Restore GP from the saved stack location
-  if (MipsFI->needGPSaveRestore())
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
-      .addImm(MFI->getObjectOffset(MipsFI->getGPFI()));
+  if (MipsFI->needGPSaveRestore()) {
+    unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI());
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset);
+
+    if (Offset >= 0x8000) {
+      BuildMI(MBB, llvm::prior(MBBI), dl, TII.get(Mips::MACRO));
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
+    }
+  }
 }
 
 void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -300,13 +306,6 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-void
-MipsFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(Mips::SP, 0);
-  Moves.push_back(MachineMove(0, Dst, Src));
-}
-
 void MipsFrameLowering::
 processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                      RegScavenger *RS) const {
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 78c78eea5b9af..c24975614c8d5 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -27,7 +27,8 @@ protected:
 
 public:
   explicit MipsFrameLowering(const MipsSubtarget &sti)
-    : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
+    : TargetFrameLowering(StackGrowsDown, sti.hasMips64() ? 16 : 8, 0),
+      STI(sti) {
   }
 
   bool targetHandlesStackFrameRounding() const;
@@ -39,8 +40,6 @@ public:
 
   bool hasFP(const MachineFunction &MF) const;
 
-  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-  
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS) const;
 };
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 90aaeb60d06fe..9c831ede9dbfc 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -86,9 +86,6 @@ private:
   // Complex Pattern.
   bool SelectAddr(SDValue N, SDValue &Base, SDValue &Offset);
 
-  SDNode *SelectLoadFp64(SDNode *N);
-  SDNode *SelectStoreFp64(SDNode *N);
-
   // getI32Imm - Return a target constant with the specified
   // value, of type i32.
   inline SDValue getI32Imm(unsigned Imm) {
@@ -114,17 +111,20 @@ SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
 /// Used on Mips Load/Store instructions
 bool MipsDAGToDAGISel::
 SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+  EVT ValTy = Addr.getValueType();
+  unsigned GPReg = ValTy == MVT::i32 ? Mips::GP : Mips::GP_64;
+
   // if Address is FI, get the TargetFrameIndex.
   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
     return true;
   }
 
   // on PIC code Load GA
   if (TM.getRelocationModel() == Reloc::PIC_) {
     if (Addr.getOpcode() == MipsISD::WrapperPIC) {
-      Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
+      Base   = CurDAG->getRegister(GPReg, ValTy);
       Offset = Addr.getOperand(0);
       return true;
     }
@@ -133,7 +133,7 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
         Addr.getOpcode() == ISD::TargetGlobalAddress))
       return false;
     else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) {
-      Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
+      Base   = CurDAG->getRegister(GPReg, ValTy);
       Offset = Addr;
       return true;
     }
@@ -147,11 +147,11 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
       // If the first operand is a FI, get the TargetFI Node
       if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
                                   (Addr.getOperand(0)))
-        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
       else
         Base = Addr.getOperand(0);
 
-      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
       return true;
     }
   }
@@ -180,134 +180,10 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
   }
 
   Base   = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  Offset = CurDAG->getTargetConstant(0, ValTy);
   return true;
 }
 
-SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) {
-  MVT::SimpleValueType NVT =
-    N->getValueType(0).getSimpleVT().SimpleTy;
-
-  if (!Subtarget.isMips1() || NVT != MVT::f64)
-    return NULL;
-
-  LoadSDNode *LN = cast<LoadSDNode>(N);
-  if (LN->getExtensionType() != ISD::NON_EXTLOAD ||
-      LN->getAddressingMode() != ISD::UNINDEXED)
-    return NULL;
-
-  SDValue Chain = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDValue Offset0, Offset1, Base;
-
-  if (!SelectAddr(N1, Base, Offset0) ||
-      N1.getValueType() != MVT::i32)
-    return NULL;
-
-  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
-  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
-  DebugLoc dl = N->getDebugLoc();
-
-  // The second load should start after for 4 bytes.
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
-    Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
-  else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Offset0))
-    Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(),
-                                            MVT::i32,
-                                            CP->getAlignment(),
-                                            CP->getOffset()+4,
-                                            CP->getTargetFlags());
-  else
-    return NULL;
-
-  // Choose the offsets depending on the endianess
-  if (TM.getTargetData()->isBigEndian())
-    std::swap(Offset0, Offset1);
-
-  // Instead of:
-  //    ldc $f0, X($3)
-  // Generate:
-  //    lwc $f0, X($3)
-  //    lwc $f1, X+4($3)
-  SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32,
-                                       MVT::Other, Base, Offset0, Chain);
-  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                 dl, NVT), 0);
-  SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl,
-                            MVT::f64, Undef, SDValue(LD0, 0));
-
-  SDNode *LD1 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32,
-                                       MVT::Other, Base, Offset1, SDValue(LD0, 1));
-  SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl,
-                            MVT::f64, I0, SDValue(LD1, 0));
-
-  ReplaceUses(SDValue(N, 0), I1);
-  ReplaceUses(SDValue(N, 1), Chain);
-  cast<MachineSDNode>(LD0)->setMemRefs(MemRefs0, MemRefs0 + 1);
-  cast<MachineSDNode>(LD1)->setMemRefs(MemRefs0, MemRefs0 + 1);
-  return I1.getNode();
-}
-
-SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) {
-
-  if (!Subtarget.isMips1() ||
-      N->getOperand(1).getValueType() != MVT::f64)
-    return NULL;
-
-  SDValue Chain = N->getOperand(0);
-
-  StoreSDNode *SN = cast<StoreSDNode>(N);
-  if (SN->isTruncatingStore() || SN->getAddressingMode() != ISD::UNINDEXED)
-    return NULL;
-
-  SDValue N1 = N->getOperand(1);
-  SDValue N2 = N->getOperand(2);
-  SDValue Offset0, Offset1, Base;
-
-  if (!SelectAddr(N2, Base, Offset0) ||
-      N1.getValueType() != MVT::f64 ||
-      N2.getValueType() != MVT::i32)
-    return NULL;
-
-  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
-  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
-  DebugLoc dl = N->getDebugLoc();
-
-  // Get the even and odd part from the f64 register
-  SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd,
-                                                 dl, MVT::f32, N1);
-  SDValue FPEven = CurDAG->getTargetExtractSubreg(Mips::sub_fpeven,
-                                                 dl, MVT::f32, N1);
-
-  // The second store should start after for 4 bytes.
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
-    Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
-  else
-    return NULL;
-
-  // Choose the offsets depending on the endianess
-  if (TM.getTargetData()->isBigEndian())
-    std::swap(Offset0, Offset1);
-
-  // Instead of:
-  //    sdc $f0, X($3)
-  // Generate:
-  //    swc $f0, X($3)
-  //    swc $f1, X+4($3)
-  SDValue Ops0[] = { FPEven, Base, Offset0, Chain };
-  Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
-                                       MVT::Other, Ops0, 4), 0);
-  cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);
-
-  SDValue Ops1[] = { FPOdd, Base, Offset1, Chain };
-  Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
-                                       MVT::Other, Ops1, 4), 0);
-  cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);
-
-  ReplaceUses(SDValue(N, 0), Chain);
-  return Chain.getNode();
-}
-
 /// Select instructions not customized! Used for
 /// expanded, promoted and normal instructions
 SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -364,6 +240,8 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
     /// Mul with two results
     case ISD::SMUL_LOHI:
     case ISD::UMUL_LOHI: {
+      assert(Node->getValueType(0) != MVT::i64 &&
+             "64-bit multiplication with two results not handled.");
       SDValue Op1 = Node->getOperand(0);
       SDValue Op2 = Node->getOperand(1);
 
@@ -389,21 +267,29 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
 
     /// Special Muls
     case ISD::MUL:
-      if (Subtarget.isMips32())
+      // Mips32 has a 32-bit three operand mul instruction.
+      if (Subtarget.hasMips32() && Node->getValueType(0) == MVT::i32)
         break;
     case ISD::MULHS:
     case ISD::MULHU: {
+      assert((Opcode == ISD::MUL || Node->getValueType(0) != MVT::i64) &&
+             "64-bit MULH* not handled.");
+      EVT Ty = Node->getValueType(0);
       SDValue MulOp1 = Node->getOperand(0);
       SDValue MulOp2 = Node->getOperand(1);
 
-      unsigned MulOp  = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+      unsigned MulOp  = (Opcode == ISD::MULHU ?
+                         Mips::MULTu :
+                         (Ty == MVT::i32 ? Mips::MULT : Mips::DMULT));
       SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl,
                                                MVT::Glue, MulOp1, MulOp2);
 
       SDValue InFlag = SDValue(MulNode, 0);
 
-      if (Opcode == ISD::MUL)
-        return CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, InFlag);
+      if (Opcode == ISD::MUL) {
+        unsigned Opc = (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
+        return CurDAG->getMachineNode(Opc, dl, Ty, InFlag);
+      }
       else
         return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
     }
@@ -417,31 +303,12 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
       if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                         Mips::ZERO, MVT::i32);
-        SDValue Undef = SDValue(
-          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f64), 0);
-        SDNode *MTC = CurDAG->getMachineNode(Mips::MTC1, dl, MVT::f32, Zero);
-        SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl,
-                            MVT::f64, Undef, SDValue(MTC, 0));
-        SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl,
-                            MVT::f64, I0, SDValue(MTC, 0));
-        ReplaceUses(SDValue(Node, 0), I1);
-        return I1.getNode();
+        return CurDAG->getMachineNode(Mips::BuildPairF64, dl, MVT::f64, Zero,
+                                      Zero);
       }
       break;
     }
 
-    case ISD::LOAD:
-      if (SDNode *ResNode = SelectLoadFp64(Node))
-        return ResNode;
-      // Other cases are autogenerated.
-      break;
-
-    case ISD::STORE:
-      if (SDNode *ResNode = SelectStoreFp64(Node))
-        return ResNode;
-      // Other cases are autogenerated.
-      break;
-
     case MipsISD::ThreadPointer: {
       unsigned SrcReg = Mips::HWR29;
       unsigned DestReg = Mips::V1;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index b4f4b1b4bf045..1932e745c5933 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -35,6 +35,18 @@
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
+// If I is a shifted mask, set the size (Size) and the first bit of the 
+// mask (Pos), and return true.
+// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).  
+static bool IsShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+  if (!isUInt<32>(I) || !isShiftedMask_32(I))
+     return false;
+
+  Size = CountPopulation_32(I);
+  Pos = CountTrailingZeros_32(I);
+  return true;
+}
+
 const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
   case MipsISD::JmpLink:           return "MipsISD::JmpLink";
@@ -61,27 +73,38 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
   case MipsISD::WrapperPIC:        return "MipsISD::WrapperPIC";
   case MipsISD::DynAlloc:          return "MipsISD::DynAlloc";
+  case MipsISD::Sync:              return "MipsISD::Sync";
+  case MipsISD::Ext:               return "MipsISD::Ext";
+  case MipsISD::Ins:               return "MipsISD::Ins";
   default:                         return NULL;
   }
 }
 
 MipsTargetLowering::
 MipsTargetLowering(MipsTargetMachine &TM)
-  : TargetLowering(TM, new MipsTargetObjectFile()) {
-  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  : TargetLowering(TM, new MipsTargetObjectFile()),
+    Subtarget(&TM.getSubtarget<MipsSubtarget>()),
+    HasMips64(Subtarget->hasMips64()), IsN64(Subtarget->isABI_N64()) {
 
   // Mips does not have i1 type, so use i32 for
   // setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   // Set up the register classes
   addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
   addRegisterClass(MVT::f32, Mips::FGR32RegisterClass);
 
+  if (HasMips64)
+    addRegisterClass(MVT::i64, Mips::CPU64RegsRegisterClass);
+
   // When dealing with single precision only, use libcalls
-  if (!Subtarget->isSingleFloat())
-    if (!Subtarget->isFP64bit())
+  if (!Subtarget->isSingleFloat()) {
+    if (HasMips64)
+      addRegisterClass(MVT::f64, Mips::FGR64RegisterClass);
+    else
       addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass);
+  }
 
   // Load extented operations for i1 types must be promoted
   setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
@@ -100,6 +123,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
 
   // Mips Custom Operations
   setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
   setOperationAction(ISD::BlockAddress,       MVT::i32,   Custom);
   setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
   setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
@@ -115,6 +139,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::SREM, MVT::i32, Expand);
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIV, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
 
   // Operations not directly supported by Mips.
   setOperationAction(ISD::BR_JT,             MVT::Other, Expand);
@@ -126,10 +154,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::CTPOP,             MVT::i32,   Expand);
   setOperationAction(ISD::CTTZ,              MVT::i32,   Expand);
   setOperationAction(ISD::ROTL,              MVT::i32,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i64,   Expand);
 
-  if (!Subtarget->isMips32r2())
+  if (!Subtarget->hasMips32r2())
     setOperationAction(ISD::ROTR, MVT::i32,   Expand);
 
+  if (!Subtarget->hasMips64r2())
+    setOperationAction(ISD::ROTR, MVT::i64,   Expand);
+
   setOperationAction(ISD::SHL_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRA_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRL_PARTS,         MVT::i32,   Expand);
@@ -159,7 +191,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
   // Use the default for now
   setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
-  setOperationAction(ISD::MEMBARRIER,        MVT::Other, Expand);
+
+  setOperationAction(ISD::MEMBARRIER,        MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE,      MVT::Other, Custom);  
+
+  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i32,    Expand);  
+  setOperationAction(ISD::ATOMIC_STORE,      MVT::i32,    Expand);  
+
+  setInsertFencesForAtomic(true);
 
   if (Subtarget->isSingleFloat())
     setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
@@ -180,6 +219,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setTargetDAGCombine(ISD::SDIVREM);
   setTargetDAGCombine(ISD::UDIVREM);
   setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
 
   setMinFunctionAlignment(2);
 
@@ -190,7 +231,12 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setExceptionSelectorRegister(Mips::A1);
 }
 
-MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
+bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+  return SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16; 
+}
+
+EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
@@ -348,7 +394,7 @@ static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG,
   if (DCI.isBeforeLegalize())
     return SDValue();
 
-  if (Subtarget->isMips32() && SelectMadd(N, &DAG))
+  if (Subtarget->hasMips32() && SelectMadd(N, &DAG))
     return SDValue(N, 0);
 
   return SDValue();
@@ -360,7 +406,7 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG,
   if (DCI.isBeforeLegalize())
     return SDValue();
 
-  if (Subtarget->isMips32() && SelectMsub(N, &DAG))
+  if (Subtarget->hasMips32() && SelectMsub(N, &DAG))
     return SDValue(N, 0);
 
   return SDValue();
@@ -372,6 +418,9 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  EVT Ty = N->getValueType(0);
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64; 
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64; 
   unsigned opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem :
                                                   MipsISD::DivRemU;
   DebugLoc dl = N->getDebugLoc();
@@ -383,7 +432,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
 
   // insert MFLO
   if (N->hasAnyUseOfValue(0)) {
-    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, Mips::LO, MVT::i32,
+    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, LO, Ty,
                                             InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
     InChain = CopyFromLo.getValue(1);
@@ -393,7 +442,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
   // insert MFHI
   if (N->hasAnyUseOfValue(1)) {
     SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl,
-                                            Mips::HI, MVT::i32, InGlue);
+                                            HI, Ty, InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
   }
 
@@ -490,6 +539,101 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG& DAG,
   return CreateCMovFP(DAG, Cond, True, False, N->getDebugLoc());
 }
 
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget* Subtarget) {
+  // Pattern match EXT.
+  //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
+  //  => ext $dst, $src, size, pos
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+    return SDValue();
+
+  SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
+  
+  // Op's first operand must be a shift right.
+  if (ShiftRight.getOpcode() != ISD::SRA && ShiftRight.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // The second operand of the shift must be an immediate.
+  uint64_t Pos;
+  ConstantSDNode *CN;
+  if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
+    return SDValue();
+  
+  Pos = CN->getZExtValue();
+
+  uint64_t SMPos, SMSize;
+  // Op's second operand must be a shifted mask.
+  if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
+      !IsShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+    return SDValue();
+
+  // Return if the shifted mask does not start at bit 0 or the sum of its size
+  // and Pos exceeds the word's size.
+  if (SMPos != 0 || Pos + SMSize > 32)
+    return SDValue();
+
+  return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), MVT::i32,
+                     ShiftRight.getOperand(0),
+                     DAG.getConstant(Pos, MVT::i32),
+                     DAG.getConstant(SMSize, MVT::i32));
+}
+  
+static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget* Subtarget) {
+  // Pattern match INS.
+  //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
+  //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1 
+  //  => ins $dst, $src, size, pos, $src1
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+    return SDValue();
+
+  SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
+  uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
+  ConstantSDNode *CN;
+
+  // See if Op's first operand matches (and $src1 , mask0).
+  if (And0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
+      !IsShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+    return SDValue();
+
+  // See if Op's second operand matches (and (shl $src, pos), mask1).
+  if (And1.getOpcode() != ISD::AND)
+    return SDValue();
+  
+  if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+      !IsShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+    return SDValue();
+
+  // The shift masks must have the same position and size.
+  if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+    return SDValue();
+
+  SDValue Shl = And1.getOperand(0);
+  if (Shl.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+    return SDValue();
+
+  unsigned Shamt = CN->getZExtValue();
+
+  // Return if the shift amount and the first bit position of mask are not the
+  // same.  
+  if (Shamt != SMPos0)
+    return SDValue();
+  
+  return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), MVT::i32,
+                     Shl.getOperand(0),
+                     DAG.getConstant(SMPos0, MVT::i32),
+                     DAG.getConstant(SMSize0, MVT::i32),
+                     And0.getOperand(0));  
+}
+  
 SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   const {
   SelectionDAG &DAG = DCI.DAG;
@@ -506,6 +650,10 @@ SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
     return PerformDivRemCombine(N, DAG, DCI, Subtarget);
   case ISD::SETCC:
     return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    return PerformANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return PerformORCombine(N, DAG, DCI, Subtarget);
   }
 
   return SDValue();
@@ -527,6 +675,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
     case ISD::VASTART:            return LowerVASTART(Op, DAG);
     case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
     case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+    case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, DAG);
+    case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
   }
   return SDValue();
 }
@@ -733,13 +883,13 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
 
-  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned OldVal = MI->getOperand(0).getReg();
   unsigned Ptr = MI->getOperand(1).getReg();
   unsigned Incr = MI->getOperand(2).getReg();
 
-  unsigned Oldval = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -758,61 +908,38 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 
   //  thisMBB:
   //    ...
-  //    sw incr, fi(sp)           // store incr to stack (when BinOpcode == 0)
   //    fallthrough --> loopMBB
-
-  // Note: for atomic.swap (when BinOpcode == 0), storing incr to stack before
-  // the loop and then loading it from stack in block loopMBB is necessary to
-  // prevent MachineLICM pass to hoist "or" instruction out of the block
-  // loopMBB.
-
-  int fi = 0;
-  if (BinOpcode == 0 && !Nand) {
-    // Get or create a temporary stack location.
-    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
-    fi = MipsFI->getAtomicFrameIndex();
-    if (fi == -1) {
-      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
-      MipsFI->setAtomicFrameIndex(fi);
-    }
-
-    BuildMI(BB, dl, TII->get(Mips::SW))
-        .addReg(Incr).addFrameIndex(fi).addImm(0);
-  }
   BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(exitMBB);
 
   //  loopMBB:
   //    ll oldval, 0(ptr)
-  //    or dest, $0, oldval
-  //    <binop> tmp1, oldval, incr
-  //    sc tmp1, 0(ptr)
-  //    beq tmp1, $0, loopMBB
+  //    <binop> storeval, oldval, incr
+  //    sc success, storeval, 0(ptr)
+  //    beq success, $0, loopMBB
   BB = loopMBB;
-  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addReg(Ptr).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::OR), Dest).addReg(Mips::ZERO).addReg(Oldval);
+  BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(Ptr).addImm(0);
   if (Nand) {
-    //  and tmp2, oldval, incr
-    //  nor tmp1, $0, tmp2
-    BuildMI(BB, dl, TII->get(Mips::AND), Tmp2).addReg(Oldval).addReg(Incr);
-    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+    //  and andres, oldval, incr
+    //  nor storeval, $0, andres
+    BuildMI(BB, dl, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, dl, TII->get(Mips::NOR), StoreVal)
+      .addReg(Mips::ZERO).addReg(AndRes);
   } else if (BinOpcode) {
-    //  <binop> tmp1, oldval, incr
-    BuildMI(BB, dl, TII->get(BinOpcode), Tmp1).addReg(Oldval).addReg(Incr);
+    //  <binop> storeval, oldval, incr
+    BuildMI(BB, dl, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
   } else {
-    //  lw tmp2, fi(sp)              // load incr from stack
-    //  or tmp1, $zero, tmp2
-    BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addFrameIndex(fi).addImm(0);
-    BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+    StoreVal = Incr;
   }
-  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addReg(Ptr).addImm(0);
+  BuildMI(BB, dl, TII->get(Mips::SC), Success)
+    .addReg(StoreVal).addReg(Ptr).addImm(0);
   BuildMI(BB, dl, TII->get(Mips::BEQ))
-    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loopMBB);
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
-  return BB;
+  return exitMBB;
 }
 
 MachineBasicBlock *
@@ -833,33 +960,34 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
   unsigned Ptr = MI->getOperand(1).getReg();
   unsigned Incr = MI->getOperand(2).getReg();
 
-  unsigned Addr = RegInfo.createVirtualRegister(RC);
-  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
   unsigned Mask = RegInfo.createVirtualRegister(RC);
   unsigned Mask2 = RegInfo.createVirtualRegister(RC);
-  unsigned Newval = RegInfo.createVirtualRegister(RC);
-  unsigned Oldval = RegInfo.createVirtualRegister(RC);
+  unsigned NewVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
   unsigned Incr2 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp10 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp11 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp12 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned SllRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineFunction::iterator It = BB;
   ++It;
   MF->insert(It, loopMBB);
+  MF->insert(It, sinkMBB);
   MF->insert(It, exitMBB);
 
   // Transfer the remainder of BB and its successor edges to exitMBB.
@@ -868,111 +996,104 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
   //  thisMBB:
-  //    addiu   tmp1,$0,-4                # 0xfffffffc
-  //    and     addr,ptr,tmp1
-  //    andi    tmp2,ptr,3
-  //    sll     shift,tmp2,3
-  //    ori     tmp3,$0,255               # 0xff
-  //    sll     mask,tmp3,shift
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
   //    nor     mask2,$0,mask
-  //    andi    tmp4,incr,255
-  //    sll     incr2,tmp4,shift
-  //    sw      incr2, fi(sp)      // store incr2 to stack (when BinOpcode == 0)
-
-  // Note: for atomic.swap (when BinOpcode == 0), storing incr2 to stack before
-  // the loop and then loading it from stack in block loopMBB is necessary to
-  // prevent MachineLICM pass to hoist "or" instruction out of the block
-  // loopMBB.
+  //    sll     incr2,incr,shiftamt
 
   int64_t MaskImm = (Size == 1) ? 255 : 65535;
-  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
-  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+    .addReg(ShiftAmt).addReg(MaskUpper);
   BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
-  if (BinOpcode != Mips::SUBu) {
-    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Incr).addImm(MaskImm);
-    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp4).addReg(Shift);
-  } else {
-    BuildMI(BB, dl, TII->get(Mips::SUBu), Tmp4).addReg(Mips::ZERO).addReg(Incr);
-    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Tmp4).addImm(MaskImm);
-    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp5).addReg(Shift);
-  }
+  BuildMI(BB, dl, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
 
-  int fi = 0;
-  if (BinOpcode == 0 && !Nand) {
-    // Get or create a temporary stack location.
-    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
-    fi = MipsFI->getAtomicFrameIndex();
-    if (fi == -1) {
-      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
-      MipsFI->setAtomicFrameIndex(fi);
-    }
-
-    BuildMI(BB, dl, TII->get(Mips::SW))
-        .addReg(Incr2).addFrameIndex(fi).addImm(0);
-  }
-  BB->addSuccessor(loopMBB);
 
+  // atomic.load.binop
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   binop   binopres,oldval,incr2
+  //   and     newval,binopres,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  // atomic.swap
   // loopMBB:
-  //   ll      oldval,0(addr)
-  //   binop   tmp7,oldval,incr2
-  //   and     newval,tmp7,mask
-  //   and     tmp8,oldval,mask2
-  //   or      tmp9,tmp8,newval
-  //   sc      tmp9,0(addr)
-  //   beq     tmp9,$0,loopMBB
+  //   ll      oldval,0(alignedaddr)
+  //   and     newval,incr2,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
   BB = loopMBB;
-  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addReg(Addr).addImm(0);
+  BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
   if (Nand) {
-    //  and tmp6, oldval, incr2
-    //  nor tmp7, $0, tmp6
-    BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval).addReg(Incr2);
-    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
-  } else if (BinOpcode == Mips::SUBu) {
-    //  addu tmp7, oldval, incr2
-    BuildMI(BB, dl, TII->get(Mips::ADDu), Tmp7).addReg(Oldval).addReg(Incr2);
+    //  and andres, oldval, incr2
+    //  nor binopres, $0, andres
+    //  and newval, binopres, mask
+    BuildMI(BB, dl, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, dl, TII->get(Mips::NOR), BinOpRes)
+      .addReg(Mips::ZERO).addReg(AndRes);
+    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
   } else if (BinOpcode) {
-    //  <binop> tmp7, oldval, incr2
-    BuildMI(BB, dl, TII->get(BinOpcode), Tmp7).addReg(Oldval).addReg(Incr2);
-  } else {
-    //  lw tmp6, fi(sp)              // load incr2 from stack
-    //  or tmp7, $zero, tmp6
-    BuildMI(BB, dl, TII->get(Mips::LW), Tmp6).addFrameIndex(fi).addImm(0);
-    BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+    //  <binop> binopres, oldval, incr2
+    //  and newval, binopres, mask
+    BuildMI(BB, dl, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else {// atomic.swap
+    //  and newval, incr2, mask
+    BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
   }
-  BuildMI(BB, dl, TII->get(Mips::AND), Newval).addReg(Tmp7).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::AND), Tmp8).addReg(Oldval).addReg(Mask2);
-  BuildMI(BB, dl, TII->get(Mips::OR), Tmp9).addReg(Tmp8).addReg(Newval);
-  BuildMI(BB, dl, TII->get(Mips::SC), Tmp9).addReg(Tmp9).addReg(Addr).addImm(0);
+    
+  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal0).addReg(NewVal);
+  BuildMI(BB, dl, TII->get(Mips::SC), Success)
+    .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, dl, TII->get(Mips::BEQ))
-      .addReg(Tmp9).addReg(Mips::ZERO).addMBB(loopMBB);
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
-
-  //  exitMBB:
-  //    and     tmp10,oldval,mask
-  //    srl     tmp11,tmp10,shift
-  //    sll     tmp12,tmp11,24
-  //    sra     dest,tmp12,24
-  BB = exitMBB;
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+  //  sinkMBB:
+  //    and     maskedoldval1,oldval,mask
+  //    srl     srlres,maskedoldval1,shiftamt
+  //    sll     sllres,srlres,24
+  //    sra     dest,sllres,24
+  BB = sinkMBB;
   int64_t ShiftImm = (Size == 1) ? 24 : 16;
-  // reverse order
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
-      .addReg(Tmp12).addImm(ShiftImm);
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp12)
-      .addReg(Tmp11).addImm(ShiftImm);
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp11)
-      .addReg(Tmp10).addReg(Shift);
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::AND), Tmp10)
-    .addReg(Oldval).addReg(Mask);
+
+  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+      .addReg(ShiftAmt).addReg(MaskedOldVal1);
+  BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+      .addReg(SrlRes).addImm(ShiftImm);
+  BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+      .addReg(SllRes).addImm(ShiftImm);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
-  return BB;
+  return exitMBB;
 }
 
 MachineBasicBlock *
@@ -989,11 +1110,10 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
 
   unsigned Dest    = MI->getOperand(0).getReg();
   unsigned Ptr     = MI->getOperand(1).getReg();
-  unsigned Oldval  = MI->getOperand(2).getReg();
-  unsigned Newval  = MI->getOperand(3).getReg();
+  unsigned OldVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
 
-  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1012,26 +1132,14 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
-  // Get or create a temporary stack location.
-  MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
-  int fi = MipsFI->getAtomicFrameIndex();
-  if (fi == -1) {
-    fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
-    MipsFI->setAtomicFrameIndex(fi);
-  }
-
   //  thisMBB:
   //    ...
-  //    sw newval, fi(sp)           // store newval to stack
   //    fallthrough --> loop1MBB
-
-  // Note: storing newval to stack before the loop and then loading it from
-  // stack in block loop2MBB is necessary to prevent MachineLICM pass to
-  // hoist "or" instruction out of the block loop2MBB.
-
-  BuildMI(BB, dl, TII->get(Mips::SW))
-      .addReg(Newval).addFrameIndex(fi).addImm(0);
   BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(exitMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(exitMBB);
 
   // loop1MBB:
   //   ll dest, 0(ptr)
@@ -1039,27 +1147,20 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   BB = loop1MBB;
   BuildMI(BB, dl, TII->get(Mips::LL), Dest).addReg(Ptr).addImm(0);
   BuildMI(BB, dl, TII->get(Mips::BNE))
-    .addReg(Dest).addReg(Oldval).addMBB(exitMBB);
-  BB->addSuccessor(exitMBB);
-  BB->addSuccessor(loop2MBB);
+    .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
 
   // loop2MBB:
-  //   lw tmp2, fi(sp)              // load newval from stack
-  //   or tmp1, $0, tmp2
-  //   sc tmp1, 0(ptr)
-  //   beq tmp1, $0, loop1MBB
+  //   sc success, newval, 0(ptr)
+  //   beq success, $0, loop1MBB
   BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addFrameIndex(fi).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
-  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addReg(Ptr).addImm(0);
+  BuildMI(BB, dl, TII->get(Mips::SC), Success)
+    .addReg(NewVal).addReg(Ptr).addImm(0);
   BuildMI(BB, dl, TII->get(Mips::BEQ))
-    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loop1MBB);
-  BB->addSuccessor(loop1MBB);
-  BB->addSuccessor(exitMBB);
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
-  return BB;
+  return exitMBB;
 }
 
 MachineBasicBlock *
@@ -1077,36 +1178,39 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
 
   unsigned Dest    = MI->getOperand(0).getReg();
   unsigned Ptr     = MI->getOperand(1).getReg();
-  unsigned Oldval  = MI->getOperand(2).getReg();
-  unsigned Newval  = MI->getOperand(3).getReg();
+  unsigned CmpVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
 
-  unsigned Addr = RegInfo.createVirtualRegister(RC);
-  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
   unsigned Mask = RegInfo.createVirtualRegister(RC);
   unsigned Mask2 = RegInfo.createVirtualRegister(RC);
-  unsigned Oldval2 = RegInfo.createVirtualRegister(RC);
-  unsigned Oldval3 = RegInfo.createVirtualRegister(RC);
-  unsigned Oldval4 = RegInfo.createVirtualRegister(RC);
-  unsigned Newval2 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
-  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned SllRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineFunction::iterator It = BB;
   ++It;
   MF->insert(It, loop1MBB);
   MF->insert(It, loop2MBB);
+  MF->insert(It, sinkMBB);
   MF->insert(It, exitMBB);
 
   // Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1115,76 +1219,90 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(sinkMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  // FIXME: computation of newval2 can be moved to loop2MBB.
   //  thisMBB:
-  //    addiu   tmp1,$0,-4                # 0xfffffffc
-  //    and     addr,ptr,tmp1
-  //    andi    tmp2,ptr,3
-  //    sll     shift,tmp2,3
-  //    ori     tmp3,$0,255               # 0xff
-  //    sll     mask,tmp3,shift
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
   //    nor     mask2,$0,mask
-  //    andi    tmp4,oldval,255
-  //    sll     oldval2,tmp4,shift
-  //    andi    tmp5,newval,255
-  //    sll     newval2,tmp5,shift
+  //    andi    maskedcmpval,cmpval,255
+  //    sll     shiftedcmpval,maskedcmpval,shiftamt
+  //    andi    maskednewval,newval,255
+  //    sll     shiftednewval,maskednewval,shiftamt
   int64_t MaskImm = (Size == 1) ? 255 : 65535;
-  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
-  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
-  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+    .addReg(ShiftAmt).addReg(MaskUpper);
   BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Oldval).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Oldval2).addReg(Tmp4).addReg(Shift);
-  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Newval).addImm(MaskImm);
-  BuildMI(BB, dl, TII->get(Mips::SLL), Newval2).addReg(Tmp5).addReg(Shift);
-  BB->addSuccessor(loop1MBB);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedCmpVal)
+    .addReg(CmpVal).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedCmpVal)
+    .addReg(ShiftAmt).addReg(MaskedCmpVal);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedNewVal)
+    .addReg(NewVal).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedNewVal)
+    .addReg(ShiftAmt).addReg(MaskedNewVal);
 
   //  loop1MBB:
-  //    ll      oldval3,0(addr)
-  //    and     oldval4,oldval3,mask
-  //    bne     oldval4,oldval2,exitMBB
+  //    ll      oldval,0(alginedaddr)
+  //    and     maskedoldval0,oldval,mask
+  //    bne     maskedoldval0,shiftedcmpval,sinkMBB
   BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(Mips::LL), Oldval3).addReg(Addr).addImm(0);
-  BuildMI(BB, dl, TII->get(Mips::AND), Oldval4).addReg(Oldval3).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask);
   BuildMI(BB, dl, TII->get(Mips::BNE))
-      .addReg(Oldval4).addReg(Oldval2).addMBB(exitMBB);
-  BB->addSuccessor(exitMBB);
-  BB->addSuccessor(loop2MBB);
+    .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
 
   //  loop2MBB:
-  //    and     tmp6,oldval3,mask2
-  //    or      tmp7,tmp6,newval2
-  //    sc      tmp7,0(addr)
-  //    beq     tmp7,$0,loop1MBB
+  //    and     maskedoldval1,oldval,mask2
+  //    or      storeval,maskedoldval1,shiftednewval
+  //    sc      success,storeval,0(alignedaddr)
+  //    beq     success,$0,loop1MBB
   BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval3).addReg(Mask2);
-  BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Tmp6).addReg(Newval2);
-  BuildMI(BB, dl, TII->get(Mips::SC), Tmp7)
-      .addReg(Tmp7).addReg(Addr).addImm(0);
+  BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+  BuildMI(BB, dl, TII->get(Mips::SC), Success)
+      .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, dl, TII->get(Mips::BEQ))
-      .addReg(Tmp7).addReg(Mips::ZERO).addMBB(loop1MBB);
-  BB->addSuccessor(loop1MBB);
-  BB->addSuccessor(exitMBB);
+      .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
 
-  //  exitMBB:
-  //    srl     tmp8,oldval4,shift
-  //    sll     tmp9,tmp8,24
-  //    sra     dest,tmp9,24
-  BB = exitMBB;
+  //  sinkMBB:
+  //    srl     srlres,maskedoldval0,shiftamt
+  //    sll     sllres,srlres,24
+  //    sra     dest,sllres,24
+  BB = sinkMBB;
   int64_t ShiftImm = (Size == 1) ? 24 : 16;
-  // reverse order
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
-      .addReg(Tmp9).addImm(ShiftImm);
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp9)
-      .addReg(Tmp8).addImm(ShiftImm);
-  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp8)
-      .addReg(Oldval4).addReg(Shift);
+
+  BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+      .addReg(ShiftAmt).addReg(MaskedOldVal0);
+  BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+      .addReg(SrlRes).addImm(ShiftImm);
+  BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+      .addReg(SllRes).addImm(ShiftImm);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
-  return BB;
+  return exitMBB;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1267,9 +1385,9 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
   // FIXME there isn't actually debug info here
   DebugLoc dl = Op.getDebugLoc();
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 	
 
-  if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
     SDVTList VTs = DAG.getVTList(MVT::i32);
 
     MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
@@ -1292,21 +1410,26 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
     return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
   }
 
-  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                          MipsII::MO_GOT);
-  GA = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, GA);
-  SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+  EVT ValTy = Op.getValueType();
+  bool HasGotOfst = (GV->hasInternalLinkage() ||
+                     (GV->hasLocalLinkage() && !isa<Function>(GV)));
+  unsigned GotFlag = IsN64 ?
+                     (HasGotOfst ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT_DISP) :
+                     MipsII::MO_GOT;
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0, GotFlag);
+  GA = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, GA);
+  SDValue ResNode = DAG.getLoad(ValTy, dl,
                                 DAG.getEntryNode(), GA, MachinePointerInfo(),
                                 false, false, 0);
   // On functions and global targets not internal linked only
   // a load from got/GP is necessary for PIC to work.
-  if (!GV->hasInternalLinkage() &&
-      (!GV->hasLocalLinkage() || isa<Function>(GV)))
+  if (!HasGotOfst)
     return ResNode;
-  SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                            MipsII::MO_ABS_LO);
-  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
-  return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
+  SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0,
+                                            IsN64 ? MipsII::MO_GOT_OFST :
+                                                    MipsII::MO_ABS_LO);
+  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, GALo);
+  return DAG.getNode(ISD::ADD, dl, ValTy, ResNode, Lo);
 }
 
 SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
@@ -1361,11 +1484,11 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     ArgListTy Args;
     ArgListEntry Entry;
     Entry.Node = Argument;
-    Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+    Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
     Args.push_back(Entry);
     std::pair<SDValue, SDValue> CallResult =
         LowerCallTo(DAG.getEntryNode(),
-                    (const Type *) Type::getInt32Ty(*DAG.getContext()),
+                    (Type *) Type::getInt32Ty(*DAG.getContext()),
                     false, false, false, false, 0, CallingConv::C, false, true,
                     DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG,
                     dl);
@@ -1557,6 +1680,25 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
+// TODO: set SType according to the desired memory barrier behavior.
+SDValue MipsTargetLowering::LowerMEMBARRIER(SDValue Op,
+                                            SelectionDAG& DAG) const {
+  unsigned SType = 0;
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, MVT::i32));
+}
+
+SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op,
+                                              SelectionDAG& DAG) const {
+  // FIXME: Need pseudo-fence for 'singlethread' fences
+  // FIXME: Set SType for weaker fences where supported/appropriate.
+  unsigned SType = 0;
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, MVT::i32));
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -1679,55 +1821,109 @@ static const unsigned O32IntRegs[] = {
   Mips::A0, Mips::A1, Mips::A2, Mips::A3
 };
 
+// Return next O32 integer argument register.
+static unsigned getNextIntArgReg(unsigned Reg) {
+  assert((Reg == Mips::A0) || (Reg == Mips::A2));
+  return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
 // Write ByVal Arg to arg registers and stack.
 static void
-WriteByValArg(SDValue& Chain, DebugLoc dl,
+WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
               SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
               SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
               MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
               const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
-              MVT PtrType) {
-  unsigned FirstWord = VA.getLocMemOffset() / 4;
-  unsigned NumWords = (Flags.getByValSize() + 3) / 4;
-  unsigned LastWord = FirstWord + NumWords;
-  unsigned CurWord;
-
-  // copy the first 4 words of byval arg to registers A0 - A3
-  for (CurWord = FirstWord; CurWord < std::min(LastWord, O32IntRegsSize);
-       ++CurWord) {
+              MVT PtrType, bool isLittle) {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  unsigned Offset = 0;
+  uint32_t RemainingSize = Flags.getByValSize();
+  unsigned ByValAlign = Flags.getByValAlign();
+
+  // Copy the first 4 words of byval arg to registers A0 - A3.
+  // FIXME: Use a stricter alignment if it enables better optimization in passes
+  //        run later.
+  for (; RemainingSize >= 4 && LocMemOffset < 4 * 4;
+       Offset += 4, RemainingSize -= 4, LocMemOffset += 4) {
     SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
-                                  DAG.getConstant((CurWord - FirstWord) * 4,
-                                                  MVT::i32));
+                                  DAG.getConstant(Offset, MVT::i32));
     SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr,
                                   MachinePointerInfo(),
-                                  false, false, 0);
+                                  false, false, std::min(ByValAlign,
+                                                         (unsigned )4));
     MemOpChains.push_back(LoadVal.getValue(1));
-    unsigned DstReg = O32IntRegs[CurWord];
+    unsigned DstReg = O32IntRegs[LocMemOffset / 4];
     RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
   }
 
-  // copy remaining part of byval arg to stack.
-  if (CurWord < LastWord) {
-    unsigned SizeInBytes = (LastWord - CurWord) * 4;
-    SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
-                              DAG.getConstant((CurWord - FirstWord) * 4,
-                                              MVT::i32));
-    LastFI = MFI->CreateFixedObject(SizeInBytes, CurWord * 4, true);
-    SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
-    Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
-                          DAG.getConstant(SizeInBytes, MVT::i32),
-                          /*Align*/4,
-                          /*isVolatile=*/false, /*AlwaysInline=*/false,
-                          MachinePointerInfo(0), MachinePointerInfo(0));
-    MemOpChains.push_back(Chain);
+  if (RemainingSize == 0)
+    return;
+
+  // If there still is a register available for argument passing, write the
+  // remaining part of the structure to it using subword loads and shifts.
+  if (LocMemOffset < 4 * 4) {
+    assert(RemainingSize <= 3 && RemainingSize >= 1 &&
+           "There must be one to three bytes remaining.");
+    unsigned LoadSize = (RemainingSize == 3 ? 2 : RemainingSize);
+    SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                                  DAG.getConstant(Offset, MVT::i32));
+    unsigned Alignment = std::min(ByValAlign, (unsigned )4);
+    SDValue LoadVal = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+                                     LoadPtr, MachinePointerInfo(),
+                                     MVT::getIntegerVT(LoadSize * 8), false,
+                                     false, Alignment);
+    MemOpChains.push_back(LoadVal.getValue(1));
+
+    // If target is big endian, shift it to the most significant half-word or
+    // byte.
+    if (!isLittle)
+      LoadVal = DAG.getNode(ISD::SHL, dl, MVT::i32, LoadVal,
+                            DAG.getConstant(32 - LoadSize * 8, MVT::i32));
+
+    Offset += LoadSize;
+    RemainingSize -= LoadSize;
+
+    // Read second subword if necessary.
+    if (RemainingSize != 0)  {
+      assert(RemainingSize == 1 && "There must be one byte remaining.");
+      LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg, 
+                            DAG.getConstant(Offset, MVT::i32));
+      unsigned Alignment = std::min(ByValAlign, (unsigned )2);
+      SDValue Subword = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+                                       LoadPtr, MachinePointerInfo(),
+                                       MVT::i8, false, false, Alignment);
+      MemOpChains.push_back(Subword.getValue(1));
+      // Insert the loaded byte to LoadVal.
+      // FIXME: Use INS if supported by target.
+      unsigned ShiftAmt = isLittle ? 16 : 8;
+      SDValue Shift = DAG.getNode(ISD::SHL, dl, MVT::i32, Subword,
+                                  DAG.getConstant(ShiftAmt, MVT::i32));
+      LoadVal = DAG.getNode(ISD::OR, dl, MVT::i32, LoadVal, Shift);
+    }
+
+    unsigned DstReg = O32IntRegs[LocMemOffset / 4];
+    RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
+    return;
   }
+
+  // Create a fixed object on stack at offset LocMemOffset and copy
+  // remaining part of byval arg to it using memcpy.
+  SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                            DAG.getConstant(Offset, MVT::i32));
+  LastFI = MFI->CreateFixedObject(RemainingSize, LocMemOffset, true);
+  SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
+  ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src,
+                             DAG.getConstant(RemainingSize, MVT::i32),
+                             std::min(ByValAlign, (unsigned)4),
+                             /*isVolatile=*/false, /*AlwaysInline=*/false,
+                             MachinePointerInfo(0), MachinePointerInfo(0));
 }
 
 /// LowerCall - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
 /// TODO: isTailCall.
 SDValue
-MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
                               CallingConv::ID CallConv, bool isVarArg,
                               bool &isTailCall,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -1757,8 +1953,13 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NextStackOffset = CCInfo.getNextStackOffset();
 
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NextStackOffset,
-                                                            true));
+  // Chain is the output chain of the last Load/Store or CopyToReg node.
+  // ByValChain is the output chain of the last Memcpy node created for copying
+  // byval arguments to the stack.
+  SDValue Chain, CallSeqStart, ByValChain;
+  SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
+  Chain = CallSeqStart = DAG.getCALLSEQ_START(InChain, NextStackOffsetVal);
+  ByValChain = InChain;
 
   // If this is the first call, create a stack frame object that points to
   // a location to which .cprestore saves $gp.
@@ -1818,8 +2019,10 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                    Arg, DAG.getConstant(1, MVT::i32));
           if (!Subtarget->isLittle())
             std::swap(Lo, Hi);
-          RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
-          RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
+          unsigned LocRegLo = VA.getLocReg(); 
+          unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+          RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+          RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
           continue;
         }
       }
@@ -1852,8 +2055,8 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
              "No support for ByVal args by ABIs other than O32 yet.");
       assert(Flags.getByValSize() &&
              "ByVal args of size 0 should have been ignored by front-end.");
-      WriteByValArg(Chain, dl, RegsToPass, MemOpChains, LastFI, MFI, DAG, Arg,
-                    VA, Flags, getPointerTy());
+      WriteByValArg(ByValChain, Chain, dl, RegsToPass, MemOpChains, LastFI, MFI,
+                    DAG, Arg, VA, Flags, getPointerTy(), Subtarget->isLittle());
       continue;
     }
 
@@ -1875,6 +2078,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   if (LastFI)
     MipsFI->extendOutArgFIRange(FirstFI, LastFI);
 
+  // If a memcpy has been created to copy a byval arg to a stack, replace the
+  // chain input of CallSeqStart with ByValChain.
+  if (InChain != ByValChain)
+    DAG.UpdateNodeOperands(CallSeqStart.getNode(), ByValChain,
+                           NextStackOffsetVal);
+
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
@@ -2071,12 +2280,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       if (RegVT == MVT::i32)
         RC = Mips::CPURegsRegisterClass;
+      else if (RegVT == MVT::i64)
+        RC = Mips::CPU64RegsRegisterClass;
       else if (RegVT == MVT::f32)
         RC = Mips::FGR32RegisterClass;
-      else if (RegVT == MVT::f64) {
-        if (!Subtarget->isSingleFloat())
-          RC = Mips::AFGR64RegisterClass;
-      } else
+      else if (RegVT == MVT::f64)
+        RC = HasMips64 ? Mips::FGR64RegisterClass : Mips::AFGR64RegisterClass;
+      else
         llvm_unreachable("RegVT not supported by FormalArguments Lowering");
 
       // Transform the arguments stored on
@@ -2105,7 +2315,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
           ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f32, ArgValue);
         if (RegVT == MVT::i32 && VA.getValVT() == MVT::f64) {
           unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
-                                    VA.getLocReg()+1, RC);
+                                    getNextIntArgReg(ArgReg), RC);
           SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
           if (!Subtarget->isLittle())
             std::swap(ArgValue, ArgValue2);
@@ -2313,7 +2523,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     // but allow it at the lowest weight.
   if (CallOperandVal == NULL)
     return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
   // Look at the constraint type.
   switch (*constraint) {
   default:
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index bda26a229e72e..4be3fed59fc06 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -81,7 +81,12 @@ namespace llvm {
 
       WrapperPIC,
 
-      DynAlloc
+      DynAlloc,
+
+      Sync,
+
+      Ext,
+      Ins
     };
   }
 
@@ -93,6 +98,8 @@ namespace llvm {
   public:
     explicit MipsTargetLowering(MipsTargetMachine &TM);
 
+    virtual bool allowsUnalignedMemoryAccesses (EVT VT) const;
+
     /// LowerOperation - Provide custom lowering hooks for some operations.
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
@@ -101,13 +108,14 @@ namespace llvm {
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - get the ISD::SETCC result ValueType
-    MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    EVT getSetCCResultType(EVT VT) const;
 
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   private:
     // Subtarget Info
     const MipsSubtarget *Subtarget;
-
+    
+    bool HasMips64, IsN64;
 
     // Lower Operand helpers
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -128,6 +136,8 @@ namespace llvm {
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 021c167997796..2fb9d184b2b16 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -27,7 +27,7 @@
 def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>,
                                             SDTCisVT<1, OtherVT>]>;
 def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
-                                         SDTCisInt<2>]>;
+                                         SDTCisVT<2, i32>]>;
 def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
                                           SDTCisSameAs<1, 2>]>;
 def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
@@ -35,12 +35,11 @@ def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
                                                 SDTCisSameAs<1, 2>]>;
 def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                                      SDTCisVT<1, f64>,
-                                                     SDTCisVT<0, i32>]>;
+                                                     SDTCisVT<2, i32>]>;
 
 def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
 def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
 def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
-def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>;
 def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
                           [SDNPHasChain, SDNPOptInGlue]>;
 def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
@@ -55,10 +54,10 @@ let PrintMethod = "printFCCOperand" in
 // Feature predicates.
 //===----------------------------------------------------------------------===//
 
-def In32BitMode      : Predicate<"!Subtarget.isFP64bit()">;
+def IsFP64bit        : Predicate<"Subtarget.isFP64bit()">;
+def NotFP64bit       : Predicate<"!Subtarget.isFP64bit()">;
 def IsSingleFloat    : Predicate<"Subtarget.isSingleFloat()">;
 def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">;
-def IsNotMipsI       : Predicate<"!Subtarget.isMips1()">;
 
 //===----------------------------------------------------------------------===//
 // Instruction Class Templates
@@ -74,97 +73,87 @@ def IsNotMipsI       : Predicate<"!Subtarget.isMips1()">;
 // Only S32 and D32 are supported right now.
 //===----------------------------------------------------------------------===//
 
-multiclass FFR1_1<bits<6> funct, string asmstr>
-{
-  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-      !strconcat(asmstr, ".s\t$fd, $fs"), []>;
-
-  def _D32  : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
-      !strconcat(asmstr, ".d\t$fd, $fs"), []>, Requires<[In32BitMode]>;
+// FP load.
+class FPLoad<bits<6> op, string opstr, PatFrag FOp, RegisterClass RC,
+             Operand MemOpnd>:
+  FFI<op, (outs RC:$ft), (ins MemOpnd:$base),
+      !strconcat(opstr, "\t$ft, $base"), [(set RC:$ft, (FOp addr:$base))]>;
+
+// FP store.
+class FPStore<bits<6> op, string opstr, PatFrag FOp, RegisterClass RC,
+              Operand MemOpnd>:
+  FFI<op, (outs), (ins RC:$ft, MemOpnd:$base),
+      !strconcat(opstr, "\t$ft, $base"), [(store RC:$ft, addr:$base)]>;
+
+// Instructions that convert an FP value to 32-bit fixed point.
+multiclass FFR1_W_M<bits<6> funct, string opstr> {
+  def _S   : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>;
+  def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>,
+             Requires<[NotFP64bit]>;
+  def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>,
+             Requires<[IsFP64bit]>;
 }
 
-multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp>
-{
-  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                 !strconcat(asmstr, ".s\t$fd, $fs"),
-                 [(set FGR32:$fd, (FOp FGR32:$fs))]>;
-
-  def _D32  : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                 !strconcat(asmstr, ".d\t$fd, $fs"),
-                 [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
+// Instructions that convert an FP value to 64-bit fixed point.
+let Predicates = [IsFP64bit] in
+multiclass FFR1_L_M<bits<6> funct, string opstr> {
+  def _S   : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>;
+  def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>;
 }
 
-class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc,
-              RegisterClass RcDst, string asmstr>:
-  FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs),
-      !strconcat(asmstr, "\t$fd, $fs"), []>;
-
+// FP-to-FP conversion instructions.
+multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> {
+  def _S   : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>;
+  def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>,
+             Requires<[NotFP64bit]>;
+  def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>,
+             Requires<[IsFP64bit]>;
+}
 
-multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp, bit isComm = 0> {
+multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> {
   let isCommutable = isComm in {
-  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd),
-                 (ins FGR32:$fs, FGR32:$ft),
-                 !strconcat(asmstr, ".s\t$fd, $fs, $ft"),
-                 [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
-
-  def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd),
-                 (ins AFGR64:$fs, AFGR64:$ft),
-                 !strconcat(asmstr, ".d\t$fd, $fs, $ft"),
-                 [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
-                 Requires<[In32BitMode]>;
+  def _S   : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>;
+  def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>,
+             Requires<[NotFP64bit]>;
+  def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>,
+             Requires<[IsFP64bit]>;
   }
 }
 
 //===----------------------------------------------------------------------===//
 // Floating Point Instructions
 //===----------------------------------------------------------------------===//
+defm ROUND_W : FFR1_W_M<0xc, "round">;
+defm ROUND_L : FFR1_L_M<0x8, "round">;
+defm TRUNC_W : FFR1_W_M<0xd, "trunc">;
+defm TRUNC_L : FFR1_L_M<0x9, "trunc">;
+defm CEIL_W  : FFR1_W_M<0xe, "ceil">;
+defm CEIL_L  : FFR1_L_M<0xa, "ceil">;
+defm FLOOR_W : FFR1_W_M<0xf, "floor">;
+defm FLOOR_L : FFR1_L_M<0xb, "floor">;
+defm CVT_W   : FFR1_W_M<0x24, "cvt">;
+defm CVT_L   : FFR1_L_M<0x25, "cvt">;
+
+def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>;
+
+let Predicates = [NotFP64bit] in {
+  def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>;
+  def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>;
+  def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>;
+}
 
-let ft = 0 in {
-  defm FLOOR_W : FFR1_1<0b001111, "floor.w">;
-  defm CEIL_W  : FFR1_1<0b001110, "ceil.w">;
-  defm ROUND_W : FFR1_1<0b001100, "round.w">;
-  defm TRUNC_W : FFR1_1<0b001101, "trunc.w">;
-  defm CVTW    : FFR1_1<0b100100, "cvt.w">;
-
-  defm FABS    : FFR1_2<0b000101, "abs",  fabs>;
-  defm FNEG    : FFR1_2<0b000111, "neg",  fneg>;
-  defm FSQRT   : FFR1_2<0b000100, "sqrt", fsqrt>;
-
-  /// Convert to Single Precison
-  def CVTS_W32 : FFR1_3<0b100000, 0x2, FGR32,  FGR32,  "cvt.s.w">;
-
-  let Predicates = [IsNotSingleFloat] in {
-    /// Ceil to long signed integer
-    def CEIL_LS   : FFR1_3<0b001010, 0x0, FGR32, FGR32, "ceil.l">;
-    def CEIL_LD   : FFR1_3<0b001010, 0x1, AFGR64, AFGR64, "ceil.l">;
-
-    /// Round to long signed integer
-    def ROUND_LS  : FFR1_3<0b001000, 0x0, FGR32, FGR32, "round.l">;
-    def ROUND_LD  : FFR1_3<0b001000, 0x1, AFGR64, AFGR64, "round.l">;
-
-    /// Floor to long signed integer
-    def FLOOR_LS  : FFR1_3<0b001011, 0x0, FGR32, FGR32, "floor.l">;
-    def FLOOR_LD  : FFR1_3<0b001011, 0x1, AFGR64, AFGR64, "floor.l">;
-
-    /// Trunc to long signed integer
-    def TRUNC_LS  : FFR1_3<0b001001, 0x0, FGR32, FGR32, "trunc.l">;
-    def TRUNC_LD  : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">;
-
-    /// Convert to long signed integer
-    def CVTL_S    : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">;
-    def CVTL_D    : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">;
-
-    /// Convert to Double Precison
-    def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">;
-    def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">;
-    def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">;
-
-    /// Convert to Single Precison
-    def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">;
-    def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">;
-  }
+let Predicates = [IsFP64bit] in {
+ def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>;
+ def CVT_S_L   : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>;
+ def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>;
+ def CVT_D64_S : FFR1<0x21, 16, "cvt", "d.s", FGR64, FGR32>;
+ def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>;
 }
 
+defm FABS    : FFR1P_M<0x5, "abs",  fabs>;
+defm FNEG    : FFR1P_M<0x7, "neg",  fneg>;
+defm FSQRT   : FFR1P_M<0x4, "sqrt", fsqrt>;
+
 // The odd-numbered registers are only referenced when doing loads,
 // stores, and moves between floating-point and integer registers.
 // When defining instructions, we reference all 32-bit registers,
@@ -178,37 +167,46 @@ let fd = 0 in {
                   "ctc1\t$fs, $rt", []>;
 
   def MFC1  : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
-                  "mfc1\t$rt, $fs", []>;
+                  "mfc1\t$rt, $fs",
+                  [(set CPURegs:$rt, (bitconvert FGR32:$fs))]>;
 
   def MTC1  : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
-                  "mtc1\t$rt, $fs", []>;
+                  "mtc1\t$rt, $fs",
+                  [(set FGR32:$fs, (bitconvert CPURegs:$rt))]>;
 }
 
-def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                   "mov.s\t$fd, $fs", []>;
-def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                   "mov.d\t$fd, $fs", []>;
+def FMOV_S   : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>;
+def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>,
+               Requires<[NotFP64bit]>;
+def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>,
+               Requires<[IsFP64bit]>;
 
 /// Floating Point Memory Instructions
-let Predicates = [IsNotSingleFloat, IsNotMipsI] in {
-  def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr),
-                 "ldc1\t$ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
-
-  def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr),
-                 "sdc1\t$ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+let Predicates = [IsN64] in {
+  def LWC1_P8   : FPLoad<0x31, "lwc1", load, FGR32, mem64>;
+  def SWC1_P8   : FPStore<0x39, "swc1", store, FGR32, mem64>;
+  def LDC164_P8 : FPLoad<0x35, "ldc1", load, FGR64, mem64>;
+  def SDC164_P8 : FPStore<0x3d, "sdc1", store, FGR64, mem64>;
 }
 
-// LWC1 and SWC1 can always be emitted with odd registers.
-def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1\t$ft, $addr",
-               [(set FGR32:$ft, (load addr:$addr))]>;
-def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr),
-               "swc1\t$ft, $addr", [(store FGR32:$ft, addr:$addr)]>;
+let Predicates = [NotN64] in {
+  def LWC1   : FPLoad<0x31, "lwc1", load, FGR32, mem>;
+  def SWC1   : FPStore<0x39, "swc1", store, FGR32, mem>;
+  let Predicates = [HasMips64] in {
+    def LDC164 : FPLoad<0x35, "ldc1", load, FGR64, mem>;
+    def SDC164 : FPStore<0x3d, "sdc1", store, FGR64, mem>;
+  }
+  let Predicates = [NotMips64] in {
+    def LDC1   : FPLoad<0x35, "ldc1", load, AFGR64, mem>;
+    def SDC1   : FPStore<0x3d, "sdc1", store, AFGR64, mem>;
+  }
+}
 
 /// Floating-point Aritmetic
-defm FADD : FFR1_4<0x10, "add", fadd, 1>;
-defm FDIV : FFR1_4<0x03, "div", fdiv>;
-defm FMUL : FFR1_4<0x02, "mul", fmul, 1>;
-defm FSUB : FFR1_4<0x01, "sub", fsub>;
+defm FADD : FFR2P_M<0x10, "add", fadd, 1>;
+defm FDIV : FFR2P_M<0x03, "div", fdiv>;
+defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>;
+defm FSUB : FFR2P_M<0x01, "sub", fsub>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Branch Codes
@@ -217,8 +215,6 @@ defm FSUB : FFR1_4<0x01, "sub", fsub>;
 // They must be kept in synch.
 def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
 def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
-def MIPS_BRANCH_FL : PatLeaf<(i32 2)>;
-def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
 
 /// Floating Point Branch of False/True (Likely)
 let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
@@ -228,8 +224,6 @@ let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
 
 def BC1F  : FBRANCH<MIPS_BRANCH_F,  "bc1f">;
 def BC1T  : FBRANCH<MIPS_BRANCH_T,  "bc1t">;
-def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">;
-def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Flag Conditions
@@ -254,7 +248,7 @@ def MIPS_FCOND_LE   : PatLeaf<(i32 14)>;
 def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
 
 /// Floating Point Compare
-let hasDelaySlot = 1, Defs=[FCR31] in {
+let Defs=[FCR31] in {
   def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
                      "c.$cc.s\t$fs, $ft",
                      [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>;
@@ -262,7 +256,7 @@ let hasDelaySlot = 1, Defs=[FCR31] in {
   def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
                      "c.$cc.d\t$fs, $ft",
                      [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>,
-                     Requires<[In32BitMode]>;
+                     Requires<[NotFP64bit]>;
 }
 
 
@@ -280,7 +274,7 @@ class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func,
 def MOVZ_S : CondMovIntFP<FGR32, 16, 18, "movz.s">;
 def MOVN_S : CondMovIntFP<FGR32, 16, 19, "movn.s">;
 
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
   def MOVZ_D : CondMovIntFP<AFGR64, 17, 18, "movz.d">;
   def MOVN_D : CondMovIntFP<AFGR64, 17, 19, "movn.d">;
 }
@@ -288,7 +282,7 @@ let Predicates = [In32BitMode] in {
 defm : MovzPats<FGR32, MOVZ_S>;
 defm : MovnPats<FGR32, MOVN_S>;
 
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
   defm : MovzPats<AFGR64, MOVZ_D>;
   defm : MovnPats<AFGR64, MOVN_D>;
 }
@@ -313,7 +307,7 @@ def MOVF : CondMovFPInt<MipsCMovFP_F, 0, "movf">;
 def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">;
 def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">;
 
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
   def MOVT_D : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">;
   def MOVF_D : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">;
 }
@@ -353,22 +347,16 @@ def fpimm0neg : PatLeaf<(fpimm), [{
 }]>;
 
 def : Pat<(f32 fpimm0), (MTC1 ZERO)>;
-def : Pat<(f32 fpimm0neg), (FNEG_S32 (MTC1 ZERO))>;
+def : Pat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
 
-def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
-def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
+def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
+def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D32_W (MTC1 CPURegs:$src))>;
 
-def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
 def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
 
-def : Pat<(i32 (bitconvert FGR32:$src)),  (MFC1 FGR32:$src)>;
-def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
-
-let Predicates = [In32BitMode] in {
-  def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>;
-  def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>;
+let Predicates = [NotFP64bit] in {
+  def : Pat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
+  def : Pat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
 }
 
-// MipsFPRound is only emitted for MipsI targets.
-def : Pat<(f32 (MipsFPRound AFGR64:$src)), (CVTW_D32 AFGR64:$src)>;
-
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 9f55fb38b959c..d246a26eb233c 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -44,7 +44,9 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
 
 // Mips Pseudo Instructions Format
 class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
-      MipsInst<outs, ins, asmstr, pattern, IIPseudo>;
+      MipsInst<outs, ins, asmstr, pattern, IIPseudo> {
+  let isPseudo = 1;
+}
 
 //===----------------------------------------------------------------------===//
 // Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
@@ -88,6 +90,21 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
   let Inst{15-0}  = imm16;
 }
 
+class CBranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  MipsInst<outs, ins, asmstr, pattern, itin>
+{
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  let opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
 //===----------------------------------------------------------------------===//
 // Format J instruction class in Mips : <|opcode|address|>
 //===----------------------------------------------------------------------===//
@@ -224,4 +241,27 @@ class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr,
   let Inst{15-11} = fs;
   let Inst{10-6}  = fd;
   let Inst{5-0}   = 17;
-}
\ No newline at end of file
+}
+
+// FP unary instructions without patterns.
+class FFR1<bits<6> funct, bits<5> fmt, string opstr, string fmtstr,
+           RegisterClass DstRC, RegisterClass SrcRC> :
+  FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs),
+      !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"), []> {
+  let ft = 0;
+}
+
+// FP unary instructions with patterns.
+class FFR1P<bits<6> funct, bits<5> fmt, string opstr, string fmtstr,
+            RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode> :
+  FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs),
+      !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"),
+      [(set DstRC:$fd, (OpNode SrcRC:$fs))]> {
+  let ft = 0;
+}
+
+class FFR2P<bits<6> funct, bits<5> fmt, string opstr,
+            string fmtstr, RegisterClass RC, SDNode OpNode> :
+  FFR<0x11, funct, fmt, (outs RC:$fd), (ins RC:$fs, RC:$ft),
+      !strconcat(opstr, ".", fmtstr, "\t$fd, $fs, $ft"),
+      [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]>;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 0a7a7f2dfe4ea..559943a8dbaee 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -17,8 +17,8 @@
 #include "InstPrinter/MipsInstPrinter.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 
 #define GET_INSTRINFO_CTOR
@@ -28,7 +28,8 @@ using namespace llvm;
 
 MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
   : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
-    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+    TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()),
+    RI(*TM.getSubtargetImpl(), *this) {}
 
 
 const MipsRegisterInfo &MipsInstrInfo::getRegisterInfo() const { 
@@ -47,8 +48,12 @@ static bool isZeroImm(const MachineOperand &op) {
 unsigned MipsInstrInfo::
 isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
-  if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) ||
-      (MI->getOpcode() == Mips::LDC1)) {
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::LW)    || (Opc == Mips::LW_P8)  || (Opc == Mips::LD) ||
+      (Opc == Mips::LD_P8) || (Opc == Mips::LWC1)   || (Opc == Mips::LWC1_P8) ||
+      (Opc == Mips::LDC1)  || (Opc == Mips::LDC164) ||
+      (Opc == Mips::LDC164_P8)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -68,8 +73,12 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
 unsigned MipsInstrInfo::
 isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
-  if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) ||
-      (MI->getOpcode() == Mips::SDC1)) {
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::SW)    || (Opc == Mips::SW_P8)  || (Opc == Mips::SD) ||
+      (Opc == Mips::SD_P8) || (Opc == Mips::SWC1)   || (Opc == Mips::SWC1_P8) ||
+      (Opc == Mips::SDC1)  || (Opc == Mips::SDC164) ||
+      (Opc == Mips::SDC164_P8)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -94,70 +103,63 @@ copyPhysReg(MachineBasicBlock &MBB,
             MachineBasicBlock::iterator I, DebugLoc DL,
             unsigned DestReg, unsigned SrcReg,
             bool KillSrc) const {
-  bool DestCPU = Mips::CPURegsRegClass.contains(DestReg);
-  bool SrcCPU  = Mips::CPURegsRegClass.contains(SrcReg);
-
-  // CPU-CPU is the most common.
-  if (DestCPU && SrcCPU) {
-    BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
-      .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
+  unsigned Opc = 0, ZeroReg = 0;
 
-  // Copy to CPU from other registers.
-  if (DestCPU) {
-    if (Mips::CCRRegClass.contains(SrcReg))
-      BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+  if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
+    if (Mips::CPURegsRegClass.contains(SrcReg))
+      Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+    else if (Mips::CCRRegClass.contains(SrcReg))
+      Opc = Mips::CFC1;
     else if (Mips::FGR32RegClass.contains(SrcReg))
-      BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+      Opc = Mips::MFC1;
     else if (SrcReg == Mips::HI)
-      BuildMI(MBB, I, DL, get(Mips::MFHI), DestReg);
+      Opc = Mips::MFHI, SrcReg = 0;
     else if (SrcReg == Mips::LO)
-      BuildMI(MBB, I, DL, get(Mips::MFLO), DestReg);
-    else
-      llvm_unreachable("Copy to CPU from invalid register");
-    return;
+      Opc = Mips::MFLO, SrcReg = 0;
   }
-
-  // Copy to other registers from CPU.
-  if (SrcCPU) {
+  else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
     if (Mips::CCRRegClass.contains(DestReg))
-      BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+      Opc = Mips::CTC1;
     else if (Mips::FGR32RegClass.contains(DestReg))
-      BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+      Opc = Mips::MTC1;
     else if (DestReg == Mips::HI)
-      BuildMI(MBB, I, DL, get(Mips::MTHI))
-        .addReg(SrcReg, getKillRegState(KillSrc));
+      Opc = Mips::MTHI, DestReg = 0;
     else if (DestReg == Mips::LO)
-      BuildMI(MBB, I, DL, get(Mips::MTLO))
-        .addReg(SrcReg, getKillRegState(KillSrc));
-    else
-      llvm_unreachable("Copy from CPU to invalid register");
-    return;
+      Opc = Mips::MTLO, DestReg = 0;
   }
-
-  if (Mips::FGR32RegClass.contains(DestReg, SrcReg)) {
-    BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
+  else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_S;
+  else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D32;
+  else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
+    Opc = Mips::MOVCCRToCCR;
+  else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+    if (Mips::CPU64RegsRegClass.contains(SrcReg))
+      Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+    else if (SrcReg == Mips::HI64)
+      Opc = Mips::MFHI64, SrcReg = 0;
+    else if (SrcReg == Mips::LO64)
+      Opc = Mips::MFLO64, SrcReg = 0;
   }
-
-  if (Mips::AFGR64RegClass.contains(DestReg, SrcReg)) {
-    BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
+  else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+    if (DestReg == Mips::HI64)
+      Opc = Mips::MTHI64, DestReg = 0;
+    else if (DestReg == Mips::LO64)
+      Opc = Mips::MTLO64, DestReg = 0;
   }
 
-  if (Mips::CCRRegClass.contains(DestReg, SrcReg)) {
-    BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-  llvm_unreachable("Cannot copy registers");
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+  
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (ZeroReg)
+    MIB.addReg(ZeroReg);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
 }
 
 void MipsInstrInfo::
@@ -167,31 +169,22 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                     const TargetRegisterInfo *TRI) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
+  unsigned Opc = 0;
 
   if (RC == Mips::CPURegsRegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::SW)).addReg(SrcReg, getKillRegState(isKill))
-                                      .addFrameIndex(FI).addImm(0);
+    Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+  else if (RC == Mips::CPU64RegsRegisterClass)
+    Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
   else if (RC == Mips::FGR32RegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::SWC1)).addReg(SrcReg, getKillRegState(isKill))
-                                        .addFrameIndex(FI).addImm(0);
-  else if (RC == Mips::AFGR64RegisterClass) {
-    if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
-      BuildMI(MBB, I, DL, get(Mips::SDC1))
-        .addReg(SrcReg, getKillRegState(isKill))
-        .addFrameIndex(FI).addImm(0);
-    } else {
-      const TargetRegisterInfo *TRI =
-        MBB.getParent()->getTarget().getRegisterInfo();
-      const unsigned *SubSet = TRI->getSubRegisters(SrcReg);
-      BuildMI(MBB, I, DL, get(Mips::SWC1))
-        .addReg(SubSet[0], getKillRegState(isKill))
-        .addFrameIndex(FI).addImm(0);
-      BuildMI(MBB, I, DL, get(Mips::SWC1))
-        .addReg(SubSet[1], getKillRegState(isKill))
-        .addFrameIndex(FI).addImm(4);
-    }
-  } else
-    llvm_unreachable("Register class not handled!");
+    Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+  else if (RC == Mips::AFGR64RegisterClass)
+    Opc = Mips::SDC1;
+  else if (RC == Mips::FGR64RegisterClass)
+    Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(0);
 }
 
 void MipsInstrInfo::
@@ -202,25 +195,21 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
 {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
+  unsigned Opc = 0;
 
   if (RC == Mips::CPURegsRegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::LW), DestReg).addFrameIndex(FI).addImm(0);
+    Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+  else if (RC == Mips::CPU64RegsRegisterClass)
+    Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
   else if (RC == Mips::FGR32RegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::LWC1), DestReg).addFrameIndex(FI).addImm(0);
-  else if (RC == Mips::AFGR64RegisterClass) {
-    if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
-      BuildMI(MBB, I, DL, get(Mips::LDC1), DestReg).addFrameIndex(FI).addImm(0);
-    } else {
-      const TargetRegisterInfo *TRI =
-        MBB.getParent()->getTarget().getRegisterInfo();
-      const unsigned *SubSet = TRI->getSubRegisters(DestReg);
-      BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[0])
-        .addFrameIndex(FI).addImm(0);
-      BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[1])
-        .addFrameIndex(FI).addImm(4);
-    }
-  } else
-    llvm_unreachable("Register class not handled!");
+    Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+  else if (RC == Mips::AFGR64RegisterClass)
+    Opc = Mips::LDC1;
+  else if (RC == Mips::FGR64RegisterClass)
+    Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(0);
 }
 
 MachineInstr*
@@ -237,9 +226,12 @@ MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
 //===----------------------------------------------------------------------===//
 
 static unsigned GetAnalyzableBrOpc(unsigned Opc) {
-  return (Opc == Mips::BEQ  || Opc == Mips::BNE  || Opc == Mips::BGTZ ||
-          Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
-          Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ? Opc : 0;
+  return (Opc == Mips::BEQ    || Opc == Mips::BNE    || Opc == Mips::BGTZ   ||
+          Opc == Mips::BGEZ   || Opc == Mips::BLTZ   || Opc == Mips::BLEZ   ||
+          Opc == Mips::BEQ64  || Opc == Mips::BNE64  || Opc == Mips::BGTZ64 ||
+          Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
+          Opc == Mips::BC1T   || Opc == Mips::BC1F   || Opc == Mips::J) ?
+         Opc : 0;
 }
 
 /// GetOppositeBranchOpc - Return the inverse of the specified
@@ -248,14 +240,20 @@ unsigned Mips::GetOppositeBranchOpc(unsigned Opc)
 {
   switch (Opc) {
   default: llvm_unreachable("Illegal opcode!");
-  case Mips::BEQ  : return Mips::BNE;
-  case Mips::BNE  : return Mips::BEQ;
-  case Mips::BGTZ : return Mips::BLEZ;
-  case Mips::BGEZ : return Mips::BLTZ;
-  case Mips::BLTZ : return Mips::BGEZ;
-  case Mips::BLEZ : return Mips::BGTZ;
-  case Mips::BC1T : return Mips::BC1F;
-  case Mips::BC1F : return Mips::BC1T;
+  case Mips::BEQ    : return Mips::BNE;
+  case Mips::BNE    : return Mips::BEQ;
+  case Mips::BGTZ   : return Mips::BLEZ;
+  case Mips::BGEZ   : return Mips::BLTZ;
+  case Mips::BLTZ   : return Mips::BGEZ;
+  case Mips::BLEZ   : return Mips::BGTZ;
+  case Mips::BEQ64  : return Mips::BNE64;
+  case Mips::BNE64  : return Mips::BEQ64;
+  case Mips::BGTZ64 : return Mips::BLEZ64;
+  case Mips::BGEZ64 : return Mips::BLTZ64;
+  case Mips::BLTZ64 : return Mips::BGEZ64;
+  case Mips::BLEZ64 : return Mips::BGTZ64;
+  case Mips::BC1T   : return Mips::BC1F;
+  case Mips::BC1F   : return Mips::BC1T;
   }
 }
 
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 4421c4862fa0c..271d2487ecfaa 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -72,12 +72,47 @@ namespace MipsII {
     /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
     // the thread pointer (Local Exec TLS).
     MO_TPREL_HI,
-    MO_TPREL_LO
+    MO_TPREL_LO,
+
+    // N32/64 Flags.
+    MO_GPOFF_HI,
+    MO_GPOFF_LO,
+    MO_GOT_DISP,
+    MO_GOT_PAGE,
+    MO_GOT_OFST
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for
+    // Mips instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo   = 0,
+
+    /// FrmR - This form is for instructions of the format R.
+    FrmR  = 1,
+    /// FrmI - This form is for instructions of the format I.
+    FrmI  = 2,
+    /// FrmJ - This form is for instructions of the format J.
+    FrmJ  = 3,
+    /// FrmFR - This form is for instructions of the format FR.
+    FrmFR = 4,
+    /// FrmFI - This form is for instructions of the format FI.
+    FrmFI = 5,
+    /// FrmOther - This form is for instructions that have no specific format.
+    FrmOther = 6,
+
+    FormMask = 15
   };
 }
 
 class MipsInstrInfo : public MipsGenInstrInfo {
   MipsTargetMachine &TM;
+  bool IsN64;
   const MipsRegisterInfo RI;
 public:
   explicit MipsInstrInfo(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index d1a0587124593..06b7de7e8e323 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -34,13 +34,20 @@ def SDT_MipsMAddMSub     : SDTypeProfile<0, 4,
                                           SDTCisSameAs<1, 2>,
                                           SDTCisSameAs<2, 3>]>;
 def SDT_MipsDivRem       : SDTypeProfile<0, 2,
-                                         [SDTCisVT<0, i32>,
+                                         [SDTCisInt<0>,
                                           SDTCisSameAs<0, 1>]>;
 
 def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
 
 def SDT_MipsDynAlloc    : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
                                                SDTCisVT<1, iPTR>]>;
+def SDT_Sync             : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+                                   SDTCisSameAs<0, 4>]>;
 
 // Call
 def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
@@ -106,6 +113,11 @@ def MipsWrapperPIC    : SDNode<"MipsISD::WrapperPIC",  SDTIntUnaryOp>;
 def MipsDynAlloc  : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc,
                            [SDNPHasChain, SDNPInGlue]>;
 
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
+
+def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
+
 //===----------------------------------------------------------------------===//
 // Mips Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
@@ -113,8 +125,13 @@ def HasSEInReg  : Predicate<"Subtarget.hasSEInReg()">;
 def HasBitCount : Predicate<"Subtarget.hasBitCount()">;
 def HasSwap     : Predicate<"Subtarget.hasSwap()">;
 def HasCondMov  : Predicate<"Subtarget.hasCondMov()">;
-def IsMips32    : Predicate<"Subtarget.isMips32()">;
-def IsMips32r2  : Predicate<"Subtarget.isMips32r2()">;
+def HasMips32    : Predicate<"Subtarget.hasMips32()">;
+def HasMips32r2  : Predicate<"Subtarget.hasMips32r2()">;
+def HasMips64    : Predicate<"Subtarget.hasMips64()">;
+def NotMips64    : Predicate<"!Subtarget.hasMips64()">;
+def HasMips64r2  : Predicate<"Subtarget.hasMips64r2()">;
+def IsN64       : Predicate<"Subtarget.isABI_N64()">;
+def NotN64      : Predicate<"!Subtarget.isABI_N64()">;
 
 //===----------------------------------------------------------------------===//
 // Mips Operand, Complex Patterns and Transformations Definitions.
@@ -124,6 +141,7 @@ def IsMips32r2  : Predicate<"Subtarget.isMips32r2()">;
 def brtarget    : Operand<OtherVT>;
 def calltarget  : Operand<i32>;
 def simm16      : Operand<i32>;
+def simm16_64   : Operand<i64>;
 def shamt       : Operand<i32>;
 
 // Unsigned Operand
@@ -137,6 +155,11 @@ def mem : Operand<i32> {
   let MIOperandInfo = (ops CPURegs, simm16);
 }
 
+def mem64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPU64Regs, simm16_64);
+}
+
 def mem_ea : Operand<i32> {
   let PrintMethod = "printMemOperandEA";
   let MIOperandInfo = (ops CPURegs, simm16);
@@ -176,37 +199,86 @@ def immZExt5 : PatLeaf<(imm), [{
 // since load and store instructions from stack used it.
 def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], []>;
 
+//===----------------------------------------------------------------------===//
+// Pattern fragment for load/store
+//===----------------------------------------------------------------------===//
+class UnalignedLoad<PatFrag Node> : PatFrag<(ops node:$ptr), (Node node:$ptr), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  return LD->getMemoryVT().getSizeInBits()/8 > LD->getAlignment();
+}]>;
+
+class AlignedLoad<PatFrag Node> : PatFrag<(ops node:$ptr), (Node node:$ptr), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  return LD->getMemoryVT().getSizeInBits()/8 <= LD->getAlignment();
+}]>;
+
+class UnalignedStore<PatFrag Node> : PatFrag<(ops node:$val, node:$ptr),
+                                             (Node node:$val, node:$ptr), [{
+  StoreSDNode *SD = cast<StoreSDNode>(N);
+  return SD->getMemoryVT().getSizeInBits()/8 > SD->getAlignment();
+}]>;
+
+class AlignedStore<PatFrag Node> : PatFrag<(ops node:$val, node:$ptr),
+                                           (Node node:$val, node:$ptr), [{
+  StoreSDNode *SD = cast<StoreSDNode>(N);
+  return SD->getMemoryVT().getSizeInBits()/8 <= SD->getAlignment();
+}]>;
+
+// Load/Store PatFrags.
+def sextloadi16_a   : AlignedLoad<sextloadi16>;
+def zextloadi16_a   : AlignedLoad<zextloadi16>;
+def extloadi16_a    : AlignedLoad<extloadi16>;
+def load_a          : AlignedLoad<load>;
+def sextloadi32_a   : AlignedLoad<sextloadi32>;
+def zextloadi32_a   : AlignedLoad<zextloadi32>;
+def extloadi32_a    : AlignedLoad<extloadi32>;
+def truncstorei16_a : AlignedStore<truncstorei16>;
+def store_a         : AlignedStore<store>;
+def truncstorei32_a : AlignedStore<truncstorei32>;
+def sextloadi16_u   : UnalignedLoad<sextloadi16>;
+def zextloadi16_u   : UnalignedLoad<zextloadi16>;
+def extloadi16_u    : UnalignedLoad<extloadi16>;
+def load_u          : UnalignedLoad<load>;
+def sextloadi32_u   : UnalignedLoad<sextloadi32>;
+def zextloadi32_u   : UnalignedLoad<zextloadi32>;
+def extloadi32_u    : UnalignedLoad<extloadi32>;
+def truncstorei16_u : UnalignedStore<truncstorei16>;
+def store_u         : UnalignedStore<store>;
+def truncstorei32_u : UnalignedStore<truncstorei32>;
+
 //===----------------------------------------------------------------------===//
 // Instructions specific format
 //===----------------------------------------------------------------------===//
 
-// Arithmetic 3 register operands
-class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
-             InstrItinClass itin, bit isComm = 0>:
-  FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin> {
+// Arithmetic and logical instructions with 3 register operands.
+class ArithLogicR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
+                  InstrItinClass itin, RegisterClass RC, bit isComm = 0>:
+  FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+     !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+     [(set RC:$rd, (OpNode RC:$rs, RC:$rt))], itin> {
+  let shamt = 0;
   let isCommutable = isComm;
 }
 
 class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm,
-                     bit isComm = 0>:
-  FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu> {
+                    InstrItinClass itin, RegisterClass RC, bit isComm = 0>:
+  FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+     !strconcat(instr_asm, "\t$rd, $rs, $rt"), [], itin> {
+  let shamt = 0;
   let isCommutable = isComm;
 }
 
-// Arithmetic 2 register operands
-class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
-             Operand Od, PatLeaf imm_type> :
-  FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))], IIAlu>;
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode,
+                  Operand Od, PatLeaf imm_type, RegisterClass RC> :
+  FI<op, (outs RC:$rt), (ins RC:$rs, Od:$i),
+     !strconcat(instr_asm, "\t$rt, $rs, $i"),
+     [(set RC:$rt, (OpNode RC:$rs, imm_type:$i))], IIAlu>;
 
 class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
-             Operand Od, PatLeaf imm_type> :
-  FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu>;
+                     Operand Od, PatLeaf imm_type, RegisterClass RC> :
+  FI<op, (outs RC:$rt), (ins RC:$rs, Od:$i),
+     !strconcat(instr_asm, "\t$rt, $rs, $i"), [], IIAlu>;
 
 // Arithmetic Multiply ADD/SUB
 let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in
@@ -214,92 +286,134 @@ class MArithR<bits<6> func, string instr_asm, SDNode op, bit isComm = 0> :
   FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
      !strconcat(instr_asm, "\t$rs, $rt"),
      [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul> {
+  let rd = 0;
+  let shamt = 0;
   let isCommutable = isComm;
 }
 
 //  Logical
-let isCommutable = 1 in
-class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
-  FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
-
-class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
-  FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, uimm16:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
-
-let isCommutable = 1 in
-class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
-  FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>;
+class LogicNOR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>:
+  FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+     !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+     [(set RC:$rd, (not (or RC:$rs, RC:$rt)))], IIAlu> {
+  let shamt = 0;
+  let isCommutable = 1;
+}
 
 // Shifts
 class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm,
                               SDNode OpNode>:
-  FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, shamt:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))], IIAlu> {
+  FR<0x00, func, (outs CPURegs:$rd), (ins CPURegs:$rt, shamt:$shamt),
+     !strconcat(instr_asm, "\t$rd, $rt, $shamt"),
+     [(set CPURegs:$rd, (OpNode CPURegs:$rt, (i32 immZExt5:$shamt)))], IIAlu> {
   let rs = _rs;
 }
 
-class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm,
+class LogicR_shift_rotate_reg<bits<6> func, bits<5> isRotate, string instr_asm,
                               SDNode OpNode>:
-  FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu> {
-  let shamt = _shamt;
+  FR<0x00, func, (outs CPURegs:$rd), (ins CPURegs:$rs, CPURegs:$rt),
+     !strconcat(instr_asm, "\t$rd, $rt, $rs"),
+     [(set CPURegs:$rd, (OpNode CPURegs:$rt, CPURegs:$rs))], IIAlu> {
+  let shamt = isRotate;
 }
 
 // Load Upper Imediate
 class LoadUpper<bits<6> op, string instr_asm>:
-  FI< op,
-      (outs CPURegs:$dst),
-      (ins uimm16:$imm),
-      !strconcat(instr_asm, "\t$dst, $imm"),
-      [], IIAlu>;
+  FI<op, (outs CPURegs:$rt), (ins uimm16:$imm),
+     !strconcat(instr_asm, "\t$rt, $imm"), [], IIAlu> {
+  let rs = 0;
+}
 
 // Memory Load/Store
-let canFoldAsLoad = 1, hasDelaySlot = 1 in
-class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>:
-  FI<op, (outs CPURegs:$dst), (ins mem:$addr),
-     !strconcat(instr_asm, "\t$dst, $addr"),
-     [(set CPURegs:$dst, (OpNode addr:$addr))], IILoad>;
+let canFoldAsLoad = 1 in
+class LoadM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC,
+            Operand MemOpnd, bit Pseudo>:
+  FI<op, (outs RC:$rt), (ins MemOpnd:$addr),
+     !strconcat(instr_asm, "\t$rt, $addr"),
+     [(set RC:$rt, (OpNode addr:$addr))], IILoad> {
+  let isPseudo = Pseudo;
+}
 
-class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>:
-  FI<op, (outs), (ins CPURegs:$dst, mem:$addr),
-     !strconcat(instr_asm, "\t$dst, $addr"),
-     [(OpNode CPURegs:$dst, addr:$addr)], IIStore>;
+class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC,
+             Operand MemOpnd, bit Pseudo>:
+  FI<op, (outs), (ins RC:$rt, MemOpnd:$addr),
+     !strconcat(instr_asm, "\t$rt, $addr"),
+     [(OpNode RC:$rt, addr:$addr)], IIStore> {
+  let isPseudo = Pseudo;
+}
+
+// 32-bit load.
+multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode,
+                   bit Pseudo = 0> {
+  def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
+               Requires<[NotN64]>;
+  def _P8    : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
+               Requires<[IsN64]>;
+} 
+
+// 64-bit load.
+multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode,
+                   bit Pseudo = 0> {
+  def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
+               Requires<[NotN64]>;
+  def _P8    : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
+               Requires<[IsN64]>;
+} 
+
+// 32-bit store.
+multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode,
+                    bit Pseudo = 0> {
+  def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
+               Requires<[NotN64]>;
+  def _P8    : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
+               Requires<[IsN64]>;
+}
+
+// 64-bit store.
+multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode,
+                    bit Pseudo = 0> {
+  def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
+               Requires<[NotN64]>;
+  def _P8    : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
+               Requires<[IsN64]>;
+}
 
 // Conditional Branch
-let isBranch = 1, isTerminator=1, hasDelaySlot = 1 in {
-class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>:
-  FI<op, (outs), (ins CPURegs:$a, CPURegs:$b, brtarget:$offset),
-     !strconcat(instr_asm, "\t$a, $b, $offset"),
-     [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)],
-     IIBranch>;
+class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>:
+  CBranchBase<op, (outs), (ins RC:$rs, RC:$rt, brtarget:$offset),
+              !strconcat(instr_asm, "\t$rs, $rt, $offset"),
+              [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$offset)], IIBranch> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+}
 
-class CBranchZero<bits<6> op, string instr_asm, PatFrag cond_op>:
-  FI<op, (outs), (ins CPURegs:$src, brtarget:$offset),
-     !strconcat(instr_asm, "\t$src, $offset"),
-     [(brcond (cond_op CPURegs:$src, 0), bb:$offset)],
-     IIBranch>;
+class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op,
+                  RegisterClass RC>:
+  CBranchBase<op, (outs), (ins RC:$rs, brtarget:$offset),
+              !strconcat(instr_asm, "\t$rs, $offset"),
+              [(brcond (i32 (cond_op RC:$rs, 0)), bb:$offset)], IIBranch> {
+  let rt = _rt;
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
 }
 
 // SetCC
-class SetCC_R<bits<6> op, bits<6> func, string instr_asm,
-      PatFrag cond_op>:
-  FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))],
-     IIAlu>;
+class SetCC_R<bits<6> op, bits<6> func, string instr_asm, PatFrag cond_op,
+              RegisterClass RC>:
+  FR<op, func, (outs CPURegs:$rd), (ins RC:$rs, RC:$rt),
+     !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+     [(set CPURegs:$rd, (cond_op RC:$rs, RC:$rt))],
+     IIAlu> {
+  let shamt = 0;
+}
 
-class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op,
-      Operand Od, PatLeaf imm_type>:
-  FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))],
+class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od,
+              PatLeaf imm_type, RegisterClass RC>:
+  FI<op, (outs CPURegs:$rd), (ins RC:$rs, Od:$i),
+     !strconcat(instr_asm, "\t$rd, $rs, $i"),
+     [(set CPURegs:$rd, (cond_op RC:$rs, imm_type:$i))],
      IIAlu>;
 
 // Unconditional branch
@@ -310,8 +424,12 @@ class JumpFJ<bits<6> op, string instr_asm>:
 
 let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1 in
 class JumpFR<bits<6> op, bits<6> func, string instr_asm>:
-  FR<op, func, (outs), (ins CPURegs:$target),
-     !strconcat(instr_asm, "\t$target"), [(brind CPURegs:$target)], IIBranch>;
+  FR<op, func, (outs), (ins CPURegs:$rs),
+     !strconcat(instr_asm, "\t$rs"), [(brind CPURegs:$rs)], IIBranch> {
+  let rt = 0;
+  let rd = 0;
+  let shamt = 0;
+}
 
 // Jump and Link (Call)
 let isCall=1, hasDelaySlot=1,
@@ -323,76 +441,124 @@ let isCall=1, hasDelaySlot=1,
        !strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)],
        IIBranch>;
 
-  let rd=31 in
   class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>:
     FR<op, func, (outs), (ins CPURegs:$rs, variable_ops),
-       !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink CPURegs:$rs)], IIBranch>;
+       !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink CPURegs:$rs)], IIBranch> {
+    let rt = 0;
+    let rd = 31;
+    let shamt = 0;
+  }
 
   class BranchLink<string instr_asm>:
     FI<0x1, (outs), (ins CPURegs:$rs, brtarget:$target, variable_ops),
-       !strconcat(instr_asm, "\t$rs, $target"), [], IIBranch>;
+       !strconcat(instr_asm, "\t$rs, $target"), [], IIBranch> {
+    let rt = 0;
+  }
 }
 
 // Mul, Div
-let Defs = [HI, LO] in {
-  let isCommutable = 1 in
-  class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
-    FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
-       !strconcat(instr_asm, "\t$a, $b"), [], itin>;
+class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
+  FR<0x00, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
+     !strconcat(instr_asm, "\t$rs, $rt"), [], itin> {
+  let rd = 0;
+  let shamt = 0;
+  let isCommutable = 1;
+  let Defs = [HI, LO];
+}
 
-  class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
-            FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
-            !strconcat(instr_asm, "\t$$zero, $a, $b"),
-            [(op CPURegs:$a, CPURegs:$b)], itin>;
+class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
+          FR<0x00, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
+          !strconcat(instr_asm, "\t$$zero, $rs, $rt"),
+          [(op CPURegs:$rs, CPURegs:$rt)], itin> {
+  let rd = 0;
+  let shamt = 0;
+  let Defs = [HI, LO];
 }
 
 // Move from Hi/Lo
 class MoveFromLOHI<bits<6> func, string instr_asm>:
-  FR<0x00, func, (outs CPURegs:$dst), (ins),
-     !strconcat(instr_asm, "\t$dst"), [], IIHiLo>;
+  FR<0x00, func, (outs CPURegs:$rd), (ins),
+     !strconcat(instr_asm, "\t$rd"), [], IIHiLo> {
+  let rs = 0;
+  let rt = 0;
+  let shamt = 0;
+}
 
 class MoveToLOHI<bits<6> func, string instr_asm>:
-  FR<0x00, func, (outs), (ins CPURegs:$src),
-     !strconcat(instr_asm, "\t$src"), [], IIHiLo>;
+  FR<0x00, func, (outs), (ins CPURegs:$rs),
+     !strconcat(instr_asm, "\t$rs"), [], IIHiLo> {
+  let rt = 0;
+  let rd = 0;
+  let shamt = 0;
+}
 
 class EffectiveAddress<string instr_asm> :
-  FI<0x09, (outs CPURegs:$dst), (ins mem_ea:$addr),
-     instr_asm, [(set CPURegs:$dst, addr:$addr)], IIAlu>;
+  FI<0x09, (outs CPURegs:$rt), (ins mem_ea:$addr),
+     instr_asm, [(set CPURegs:$rt, addr:$addr)], IIAlu>;
 
 // Count Leading Ones/Zeros in Word
 class CountLeading<bits<6> func, string instr_asm, list<dag> pattern>:
-  FR<0x1c, func, (outs CPURegs:$dst), (ins CPURegs:$src),
-     !strconcat(instr_asm, "\t$dst, $src"), pattern, IIAlu>,
+  FR<0x1c, func, (outs CPURegs:$rd), (ins CPURegs:$rs),
+     !strconcat(instr_asm, "\t$rd, $rs"), pattern, IIAlu>,
      Requires<[HasBitCount]> {
   let shamt = 0;
   let rt = rd;
 }
 
 // Sign Extend in Register.
-class SignExtInReg<bits<6> func, string instr_asm, ValueType vt>:
-  FR<0x3f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
-     !strconcat(instr_asm, "\t$dst, $src"),
-     [(set CPURegs:$dst, (sext_inreg CPURegs:$src, vt))], NoItinerary>;
+class SignExtInReg<bits<5> sa, string instr_asm, ValueType vt>:
+  FR<0x3f, 0x20, (outs CPURegs:$rd), (ins CPURegs:$rt),
+     !strconcat(instr_asm, "\t$rd, $rt"),
+     [(set CPURegs:$rd, (sext_inreg CPURegs:$rt, vt))], NoItinerary> {
+  let rs = 0;
+  let shamt = sa;
+  let Predicates = [HasSEInReg];
+}
 
 // Byte Swap
-class ByteSwap<bits<6> func, string instr_asm>:
-  FR<0x1f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
-     !strconcat(instr_asm, "\t$dst, $src"),
-     [(set CPURegs:$dst, (bswap CPURegs:$src))], NoItinerary>;
-
-// Conditional Move
-class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
-  FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$F, CPURegs:$T,
-     CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"),
-     [], NoItinerary>;
+class ByteSwap<bits<6> func, bits<5> sa, string instr_asm>:
+  FR<0x1f, func, (outs CPURegs:$rd), (ins CPURegs:$rt),
+     !strconcat(instr_asm, "\t$rd, $rt"),
+     [(set CPURegs:$rd, (bswap CPURegs:$rt))], NoItinerary> {
+  let rs = 0;
+  let shamt = sa;
+  let Predicates = [HasSwap];
+}
 
 // Read Hardware
-class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$dst), (ins HWRegs:$src),
-    "rdhwr\t$dst, $src", [], IIAlu> {
+class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$rt), (ins HWRegs:$rd),
+    "rdhwr\t$rt, $rd", [], IIAlu> {
   let rs = 0;
   let shamt = 0;
 }
 
+// Ext and Ins
+class ExtIns<bits<6> _funct, string instr_asm, dag outs, dag ins,
+             list<dag> pattern, InstrItinClass itin>:
+  FR<0x1f, _funct, outs, ins, !strconcat(instr_asm, " $rt, $rs, $pos, $sz"),
+     pattern, itin>, Requires<[HasMips32r2]> {
+  bits<5> pos;
+  bits<5> sz;
+  let rd = sz;
+  let shamt = pos;
+}
+
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, string Opstr> :
+  MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+             !strconcat("atomic_", Opstr, "\t$dst, $ptr, $incr"),
+             [(set CPURegs:$dst,
+              (Op CPURegs:$ptr, CPURegs:$incr))]>;
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, string Width> :
+  MipsPseudo<(outs CPURegs:$dst), 
+             (ins CPURegs:$ptr, CPURegs:$cmp, CPURegs:$swap),
+             !strconcat("atomic_cmp_swap_", Width, 
+                        "\t$dst, $ptr, $cmp, $swap"),
+             [(set CPURegs:$dst,
+              (Op CPURegs:$ptr, CPURegs:$cmp, CPURegs:$swap))]>;
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
 //===----------------------------------------------------------------------===//
@@ -427,112 +593,32 @@ def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
 def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc", []>;
 
 let usesCustomInserter = 1 in {
-  def ATOMIC_LOAD_ADD_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_add_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_add_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_ADD_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_add_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_add_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_ADD_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_add_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_add_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_LOAD_SUB_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_sub_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_sub_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_SUB_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_sub_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_sub_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_SUB_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_sub_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_sub_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_LOAD_AND_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_and_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_and_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_AND_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_and_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_and_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_AND_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_and_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_and_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_LOAD_OR_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_or_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_or_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_OR_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_or_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_or_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_OR_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_or_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_or_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_LOAD_XOR_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_xor_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_xor_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_XOR_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_xor_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_xor_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_XOR_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_xor_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_xor_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_LOAD_NAND_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_nand_8\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_nand_8 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_NAND_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_nand_16\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_nand_16 CPURegs:$ptr, CPURegs:$incr))]>;
-  def ATOMIC_LOAD_NAND_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
-    "atomic_load_nand_32\t$dst, $ptr, $incr",
-    [(set CPURegs:$dst, (atomic_load_nand_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
-  def ATOMIC_SWAP_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
-    "atomic_swap_8\t$dst, $ptr, $val",
-    [(set CPURegs:$dst, (atomic_swap_8 CPURegs:$ptr, CPURegs:$val))]>;
-  def ATOMIC_SWAP_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
-    "atomic_swap_16\t$dst, $ptr, $val",
-    [(set CPURegs:$dst, (atomic_swap_16 CPURegs:$ptr, CPURegs:$val))]>;
-  def ATOMIC_SWAP_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
-    "atomic_swap_32\t$dst, $ptr, $val",
-    [(set CPURegs:$dst, (atomic_swap_32 CPURegs:$ptr, CPURegs:$val))]>;
-
-  def ATOMIC_CMP_SWAP_I8 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
-    "atomic_cmp_swap_8\t$dst, $ptr, $oldval, $newval",
-    [(set CPURegs:$dst,
-         (atomic_cmp_swap_8 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
-  def ATOMIC_CMP_SWAP_I16 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
-    "atomic_cmp_swap_16\t$dst, $ptr, $oldval, $newval",
-    [(set CPURegs:$dst,
-         (atomic_cmp_swap_16 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
-  def ATOMIC_CMP_SWAP_I32 : MipsPseudo<
-    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
-    "atomic_cmp_swap_32\t$dst, $ptr, $oldval, $newval",
-    [(set CPURegs:$dst,
-         (atomic_cmp_swap_32 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, "load_add_8">;
+  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, "load_add_16">;
+  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, "load_add_32">;
+  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, "load_sub_8">;
+  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, "load_sub_16">;
+  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, "load_sub_32">;
+  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, "load_and_8">;
+  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, "load_and_16">;
+  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, "load_and_32">;
+  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, "load_or_8">;
+  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, "load_or_16">;
+  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, "load_or_32">;
+  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, "load_xor_8">;
+  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, "load_xor_16">;
+  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, "load_xor_32">;
+  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, "load_nand_8">;
+  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, "load_nand_16">;
+  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, "load_nand_32">;
+
+  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, "swap_8">;
+  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, "swap_16">;
+  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, "swap_32">;
+
+  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, "8">;
+  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, "16">;
+  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, "32">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -544,26 +630,26 @@ let usesCustomInserter = 1 in {
 //===----------------------------------------------------------------------===//
 
 /// Arithmetic Instructions (ALU Immediate)
-def ADDiu   : ArithI<0x09, "addiu", add, simm16, immSExt16>;
-def ADDi    : ArithOverflowI<0x08, "addi",  add, simm16, immSExt16>;
-def SLTi    : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>;
-def SLTiu   : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16>;
-def ANDi    : LogicI<0x0c, "andi", and>;
-def ORi     : LogicI<0x0d, "ori",  or>;
-def XORi    : LogicI<0x0e, "xori",  xor>;
+def ADDiu   : ArithLogicI<0x09, "addiu", add, simm16, immSExt16, CPURegs>;
+def ADDi    : ArithOverflowI<0x08, "addi", add, simm16, immSExt16, CPURegs>;
+def SLTi    : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16, CPURegs>;
+def SLTiu   : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16, CPURegs>;
+def ANDi    : ArithLogicI<0x0c, "andi", and, uimm16, immZExt16, CPURegs>;
+def ORi     : ArithLogicI<0x0d, "ori", or, uimm16, immZExt16, CPURegs>;
+def XORi    : ArithLogicI<0x0e, "xori", xor, uimm16, immZExt16, CPURegs>;
 def LUi     : LoadUpper<0x0f, "lui">;
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu, 1>;
-def SUBu    : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
-def ADD     : ArithOverflowR<0x00, 0x20, "add", 1>;
-def SUB     : ArithOverflowR<0x00, 0x22, "sub">;
-def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt>;
-def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult>;
-def AND     : LogicR<0x24, "and", and>;
-def OR      : LogicR<0x25, "or",  or>;
-def XOR     : LogicR<0x26, "xor", xor>;
-def NOR     : LogicNOR<0x00, 0x27, "nor">;
+def ADDu    : ArithLogicR<0x00, 0x21, "addu", add, IIAlu, CPURegs, 1>;
+def SUBu    : ArithLogicR<0x00, 0x23, "subu", sub, IIAlu, CPURegs>;
+def ADD     : ArithOverflowR<0x00, 0x20, "add", IIAlu, CPURegs, 1>;
+def SUB     : ArithOverflowR<0x00, 0x22, "sub", IIAlu, CPURegs>;
+def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt, CPURegs>;
+def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult, CPURegs>;
+def AND     : ArithLogicR<0x00, 0x24, "and", and, IIAlu, CPURegs, 1>;
+def OR      : ArithLogicR<0x00, 0x25, "or",  or, IIAlu, CPURegs, 1>;
+def XOR     : ArithLogicR<0x00, 0x26, "xor", xor, IIAlu, CPURegs, 1>;
+def NOR     : LogicNOR<0x00, 0x27, "nor", CPURegs>;
 
 /// Shift Instructions
 def SLL     : LogicR_shift_rotate_imm<0x00, 0x00, "sll", shl>;
@@ -574,45 +660,58 @@ def SRLV    : LogicR_shift_rotate_reg<0x06, 0x00, "srlv", srl>;
 def SRAV    : LogicR_shift_rotate_reg<0x07, 0x00, "srav", sra>;
 
 // Rotate Instructions
-let Predicates = [IsMips32r2] in {
+let Predicates = [HasMips32r2] in {
     def ROTR    : LogicR_shift_rotate_imm<0x02, 0x01, "rotr", rotr>;
     def ROTRV   : LogicR_shift_rotate_reg<0x06, 0x01, "rotrv", rotr>;
 }
 
 /// Load and Store Instructions
-def LB      : LoadM<0x20, "lb",  sextloadi8>;
-def LBu     : LoadM<0x24, "lbu", zextloadi8>;
-def LH      : LoadM<0x21, "lh",  sextloadi16>;
-def LHu     : LoadM<0x25, "lhu", zextloadi16>;
-def LW      : LoadM<0x23, "lw",  load>;
-def SB      : StoreM<0x28, "sb", truncstorei8>;
-def SH      : StoreM<0x29, "sh", truncstorei16>;
-def SW      : StoreM<0x2b, "sw", store>;
+///  aligned
+defm LB      : LoadM32<0x20, "lb",  sextloadi8>;
+defm LBu     : LoadM32<0x24, "lbu", zextloadi8>;
+defm LH      : LoadM32<0x21, "lh",  sextloadi16_a>;
+defm LHu     : LoadM32<0x25, "lhu", zextloadi16_a>;
+defm LW      : LoadM32<0x23, "lw",  load_a>;
+defm SB      : StoreM32<0x28, "sb", truncstorei8>;
+defm SH      : StoreM32<0x29, "sh", truncstorei16_a>;
+defm SW      : StoreM32<0x2b, "sw", store_a>;
+
+///  unaligned
+defm ULH     : LoadM32<0x21, "ulh",  sextloadi16_u, 1>;
+defm ULHu    : LoadM32<0x25, "ulhu", zextloadi16_u, 1>;
+defm ULW     : LoadM32<0x23, "ulw",  load_u, 1>;
+defm USH     : StoreM32<0x29, "ush", truncstorei16_u, 1>;
+defm USW     : StoreM32<0x2b, "usw", store_u, 1>;
+
+let hasSideEffects = 1 in
+def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype",
+                    [(MipsSync imm:$stype)], NoItinerary>
+{
+  let opcode = 0;
+  let Inst{25-11} = 0;
+  let Inst{5-0} = 15;
+}
 
 /// Load-linked, Store-conditional
-let hasDelaySlot = 1 in
+let mayLoad = 1 in
   def LL    : FI<0x30, (outs CPURegs:$dst), (ins mem:$addr),
               "ll\t$dst, $addr", [], IILoad>;
-let Constraints = "$src = $dst" in
+let mayStore = 1, Constraints = "$src = $dst" in
   def SC    : FI<0x38, (outs CPURegs:$dst), (ins CPURegs:$src, mem:$addr),
               "sc\t$src, $addr", [], IIStore>;
 
 /// Jump and Branch Instructions
 def J       : JumpFJ<0x02, "j">;
-def JR      : JumpFR<0x00, 0x08, "jr">;
+let isIndirectBranch = 1 in
+  def JR      : JumpFR<0x00, 0x08, "jr">;
 def JAL     : JumpLink<0x03, "jal">;
 def JALR    : JumpLinkReg<0x00, 0x09, "jalr">;
-def BEQ     : CBranch<0x04, "beq", seteq>;
-def BNE     : CBranch<0x05, "bne", setne>;
-
-let rt=1 in
-  def BGEZ  : CBranchZero<0x01, "bgez", setge>;
-
-let rt=0 in {
-  def BGTZ  : CBranchZero<0x07, "bgtz", setgt>;
-  def BLEZ  : CBranchZero<0x07, "blez", setle>;
-  def BLTZ  : CBranchZero<0x01, "bltz", setlt>;
-}
+def BEQ     : CBranch<0x04, "beq", seteq, CPURegs>;
+def BNE     : CBranch<0x05, "bne", setne, CPURegs>;
+def BGEZ    : CBranchZero<0x01, 1, "bgez", setge, CPURegs>;
+def BGTZ    : CBranchZero<0x07, 0, "bgtz", setgt, CPURegs>;
+def BLEZ    : CBranchZero<0x07, 0, "blez", setle, CPURegs>;
+def BLTZ    : CBranchZero<0x01, 0, "bltz", setlt, CPURegs>;
 
 def BGEZAL  : BranchLink<"bgezal">;
 def BLTZAL  : BranchLink<"bltzal">;
@@ -639,40 +738,31 @@ let Uses = [LO] in
   def MFLO  : MoveFromLOHI<0x12, "mflo">;
 
 /// Sign Ext In Register Instructions.
-let Predicates = [HasSEInReg] in {
-  let shamt = 0x10, rs = 0 in
-    def SEB : SignExtInReg<0x21, "seb", i8>;
-
-  let shamt = 0x18, rs = 0 in
-    def SEH : SignExtInReg<0x20, "seh", i16>;
-}
+def SEB : SignExtInReg<0x10, "seb", i8>;
+def SEH : SignExtInReg<0x18, "seh", i16>;
 
 /// Count Leading
-def CLZ : CountLeading<0b100000, "clz",
-                       [(set CPURegs:$dst, (ctlz CPURegs:$src))]>;
-def CLO : CountLeading<0b100001, "clo",
-                       [(set CPURegs:$dst, (ctlz (not CPURegs:$src)))]>;
+def CLZ : CountLeading<0x20, "clz",
+                       [(set CPURegs:$rd, (ctlz CPURegs:$rs))]>;
+def CLO : CountLeading<0x21, "clo",
+                       [(set CPURegs:$rd, (ctlz (not CPURegs:$rs)))]>;
 
 /// Byte Swap
-let Predicates = [HasSwap] in {
-  let shamt = 0x3, rs = 0 in
-    def WSBW : ByteSwap<0x20, "wsbw">;
-}
-
-/// Conditional Move
-def MIPS_CMOV_ZERO  : PatLeaf<(i32 0)>;
-def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
+def WSBW : ByteSwap<0x20, 0x2, "wsbw">;
 
 // Conditional moves:
 // These instructions are expanded in
 // MipsISelLowering::EmitInstrWithCustomInserter if target does not have
 // conditional move instructions.
 // flag:int, data:int
-let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in
-  class CondMovIntInt<bits<6> funct, string instr_asm> :
-    FR<0, funct, (outs CPURegs:$dst),
-       (ins CPURegs:$T, CPURegs:$cond, CPURegs:$F),
-       !strconcat(instr_asm, "\t$dst, $T, $cond"), [], NoItinerary>;
+class CondMovIntInt<bits<6> funct, string instr_asm> :
+  FR<0, funct, (outs CPURegs:$rd),
+     (ins CPURegs:$rs, CPURegs:$rt, CPURegs:$F),
+     !strconcat(instr_asm, "\t$rd, $rs, $rt"), [], NoItinerary> {
+  let shamt = 0;
+  let usesCustomInserter = 1;
+  let Constraints = "$F = $rd";
+}
 
 def MOVZ_I : CondMovIntInt<0x0a, "movz">;
 def MOVN_I : CondMovIntInt<0x0b, "movn">;
@@ -685,13 +775,13 @@ let addr=0 in
 // instructions. The same not happens for stack address copies, so an
 // add op with mem ComplexPattern is used and the stack address copy
 // can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, $addr">;
+def LEA_ADDiu : EffectiveAddress<"addiu\t$rt, $addr">;
 
 // DynAlloc node points to dynamically allocated stack space.
 // $sp is added to the list of implicitly used registers to prevent dead code
 // elimination from removing instructions that modify $sp.
 let Uses = [SP] in
-def DynAlloc : EffectiveAddress<"addiu\t$dst, $addr">;
+def DynAlloc : EffectiveAddress<"addiu\t$rt, $addr">;
 
 // MADD*/MSUB*
 def MADD  : MArithR<0, "madd", MipsMAdd, 1>;
@@ -701,10 +791,25 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>;
 
 // MUL is a assembly macro in the current used ISAs. In recent ISA's
 // it is a real instruction.
-def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul, 1>, Requires<[IsMips32]>;
+def MUL   : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>,
+            Requires<[HasMips32]>;
 
 def RDHWR : ReadHardware;
 
+def EXT : ExtIns<0, "ext", (outs CPURegs:$rt),
+                 (ins CPURegs:$rs, uimm16:$pos, uimm16:$sz),
+                 [(set CPURegs:$rt,
+                   (MipsExt CPURegs:$rs, immZExt5:$pos, immZExt5:$sz))],
+                 NoItinerary>;
+
+let Constraints = "$src = $rt" in
+def INS : ExtIns<4, "ins", (outs CPURegs:$rt),
+                 (ins CPURegs:$rs, uimm16:$pos, uimm16:$sz, CPURegs:$src),
+                 [(set CPURegs:$rt,
+                   (MipsIns CPURegs:$rs, immZExt5:$pos, immZExt5:$sz,
+                    CPURegs:$src))],
+                 NoItinerary>;
+
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
 //===----------------------------------------------------------------------===//
@@ -738,16 +843,20 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
 // hi/lo relocs
 def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
 def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
           (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
           (ADDiu CPURegs:$hi, tblockaddress:$lo)>;
 
 def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
           (ADDiu CPURegs:$hi, tjumptable:$lo)>;
 
 def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
           (ADDiu CPURegs:$hi, tconstpool:$lo)>;
 
@@ -763,6 +872,7 @@ def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)),
 
 // tprel hi/lo
 def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : Pat<(MipsTprelLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
 def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)),
           (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
 
@@ -784,60 +894,67 @@ def : Pat<(not CPURegs:$in),
 // extended load and stores
 def : Pat<(extloadi1  addr:$src), (LBu addr:$src)>;
 def : Pat<(extloadi8  addr:$src), (LBu addr:$src)>;
-def : Pat<(extloadi16 addr:$src), (LHu addr:$src)>;
+def : Pat<(extloadi16_a addr:$src), (LHu addr:$src)>;
+def : Pat<(extloadi16_u addr:$src), (ULHu addr:$src)>;
 
 // peepholes
 def : Pat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
 
 // brcond patterns
-def : Pat<(brcond (setne CPURegs:$lhs, 0), bb:$dst),
-          (BNE CPURegs:$lhs, ZERO, bb:$dst)>;
-def : Pat<(brcond (seteq CPURegs:$lhs, 0), bb:$dst),
-          (BEQ CPURegs:$lhs, ZERO, bb:$dst)>;
-
-def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
-          (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
-          (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
-          (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
-          (BEQ (SLTiu CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-
-def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
-          (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
-          (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
-
-def : Pat<(brcond CPURegs:$cond, bb:$dst),
-          (BNE CPURegs:$cond, ZERO, bb:$dst)>;
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
+                      Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp,
+                      Instruction SLTiuOp, Register ZEROReg> {
+def : Pat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+          (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+          (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+def : Pat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+          (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+          (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+
+def : Pat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+
+def : Pat<(brcond RC:$cond, bb:$dst),
+          (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+
+defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
 
 // select patterns
 multiclass MovzPats<RegisterClass RC, Instruction MOVZInst> {
-  def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setge CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLT CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
-  def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setuge CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
-  def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setge CPURegs:$lhs, immSExt16:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs), RC:$F)>;
-  def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setuge CPURegs:$lh, immSExt16:$rh)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLTiu CPURegs:$lh, immSExt16:$rh), RC:$F)>;
-  def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setle CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLT CPURegs:$rhs, CPURegs:$lhs), RC:$F)>;
-  def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setule CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs), RC:$F)>;
-  def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (seteq CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVZInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
-  def : Pat<(select (seteq CPURegs:$lhs, 0), RC:$T, RC:$F),
+  def : Pat<(select (i32 (seteq CPURegs:$lhs, 0)), RC:$T, RC:$F),
             (MOVZInst RC:$T, CPURegs:$lhs, RC:$F)>;
 }
 
 multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> {
-  def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setne CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
             (MOVNInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
   def : Pat<(select CPURegs:$cond, RC:$T, RC:$F),
             (MOVNInst RC:$T, CPURegs:$cond, RC:$F)>;
-  def : Pat<(select (setne CPURegs:$lhs, 0), RC:$T, RC:$F),
+  def : Pat<(select (i32 (setne CPURegs:$lhs, 0)), RC:$T, RC:$F),
             (MOVNInst RC:$T, CPURegs:$lhs, RC:$F)>;
 }
 
@@ -845,30 +962,48 @@ defm : MovzPats<CPURegs, MOVZ_I>;
 defm : MovnPats<CPURegs, MOVN_I>;
 
 // setcc patterns
-def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
-          (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
-def : Pat<(setne CPURegs:$lhs, CPURegs:$rhs),
-          (SLTu ZERO, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
-
-def : Pat<(setle CPURegs:$lhs, CPURegs:$rhs),
-          (XORi (SLT CPURegs:$rhs, CPURegs:$lhs), 1)>;
-def : Pat<(setule CPURegs:$lhs, CPURegs:$rhs),
-          (XORi (SLTu CPURegs:$rhs, CPURegs:$lhs), 1)>;
-
-def : Pat<(setgt CPURegs:$lhs, CPURegs:$rhs),
-          (SLT CPURegs:$rhs, CPURegs:$lhs)>;
-def : Pat<(setugt CPURegs:$lhs, CPURegs:$rhs),
-          (SLTu CPURegs:$rhs, CPURegs:$lhs)>;
-
-def : Pat<(setge CPURegs:$lhs, CPURegs:$rhs),
-          (XORi (SLT CPURegs:$lhs, CPURegs:$rhs), 1)>;
-def : Pat<(setuge CPURegs:$lhs, CPURegs:$rhs),
-          (XORi (SLTu CPURegs:$lhs, CPURegs:$rhs), 1)>;
-
-def : Pat<(setge CPURegs:$lhs, immSExt16:$rhs),
-          (XORi (SLTi CPURegs:$lhs, immSExt16:$rhs), 1)>;
-def : Pat<(setuge CPURegs:$lhs, immSExt16:$rhs),
-          (XORi (SLTiu CPURegs:$lhs, immSExt16:$rhs), 1)>;
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+                     Instruction SLTuOp, Register ZEROReg> {
+  def : Pat<(seteq RC:$lhs, RC:$rhs),
+            (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+  def : Pat<(setne RC:$lhs, RC:$rhs),
+            (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : Pat<(setle RC:$lhs, RC:$rhs),
+            (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
+  def : Pat<(setule RC:$lhs, RC:$rhs),
+            (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : Pat<(setgt RC:$lhs, RC:$rhs),
+            (SLTOp RC:$rhs, RC:$lhs)>;
+  def : Pat<(setugt RC:$lhs, RC:$rhs),
+            (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : Pat<(setge RC:$lhs, RC:$rhs),
+            (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
+  def : Pat<(setuge RC:$lhs, RC:$rhs),
+            (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
+                        Instruction SLTiuOp> {
+  def : Pat<(setge RC:$lhs, immSExt16:$rhs),
+            (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+  def : Pat<(setuge RC:$lhs, immSExt16:$rhs),
+            (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>;
+defm : SetlePats<CPURegs, SLT, SLTu>;
+defm : SetgtPats<CPURegs, SLT, SLTu>;
+defm : SetgePats<CPURegs, SLT, SLTu>;
+defm : SetgeImmPats<CPURegs, SLTi, SLTiu>;
 
 // select MipsDynAlloc
 def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
@@ -878,4 +1013,5 @@ def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
 //===----------------------------------------------------------------------===//
 
 include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
 
diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
new file mode 100644
index 0000000000000..28c2b48b2e192
--- /dev/null
+++ b/lib/Target/Mips/MipsJITInfo.cpp
@@ -0,0 +1,230 @@
+//===- MipsJITInfo.cpp - Implement the JIT interfaces for the Mips target -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Mips target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "MipsJITInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Memory.h"
+#include <cstdlib>
+using namespace llvm;
+
+
+void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring. This code saves registers, calls
+// MipsCompilationCallbackC and restores registers.
+extern "C" {
+#if defined (__mips__)
+void MipsCompilationCallback();
+
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "MipsCompilationCallback\n"
+    ASMPREFIX "MipsCompilationCallback:\n"
+    ".ent " ASMPREFIX "MipsCompilationCallback\n"
+    ".frame  $29, 32, $31\n"
+    ".set  noreorder\n"
+    ".cpload $t9\n"
+
+    "addiu $sp, $sp, -60\n"
+    ".cprestore 16\n"
+
+    // Save argument registers a0, a1, a2, a3, f12, f14 since they may contain
+    // stuff for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned. We also need to save the ra register since it contains the
+    // original return address, and t8 register since it contains the address
+    // of the end of function stub.
+    "sw $a0, 20($sp)\n"
+    "sw $a1, 24($sp)\n"
+    "sw $a2, 28($sp)\n"
+    "sw $a3, 32($sp)\n"
+    "sw $ra, 36($sp)\n"
+    "sw $t8, 40($sp)\n"
+    "sdc1 $f12, 44($sp)\n"
+    "sdc1 $f14, 52($sp)\n"
+
+    // t8 points at the end of function stub. Pass the beginning of the stub
+    // to the MipsCompilationCallbackC.
+    "addiu $a0, $t8, -16\n"
+    "jal " ASMPREFIX "MipsCompilationCallbackC\n"
+    "nop\n"
+
+    // Restore registers.
+    "lw $a0, 20($sp)\n"
+    "lw $a1, 24($sp)\n"
+    "lw $a2, 28($sp)\n"
+    "lw $a3, 32($sp)\n"
+    "lw $ra, 36($sp)\n"
+    "lw $t8, 40($sp)\n"
+    "ldc1 $f12, 44($sp)\n"
+    "ldc1 $f14, 52($sp)\n"
+    "addiu $sp, $sp, 60\n"
+
+    // Jump to the (newly modified) stub to invoke the real function.
+    "addiu $t8, $t8, -16\n"
+    "jr $t8\n"
+    "nop\n"
+
+    ".set  reorder\n"
+    ".end " ASMPREFIX "MipsCompilationCallback\n"
+      );
+#else  // host != Mips
+  void MipsCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call MipsCompilationCallback() on a non-Mips arch!");
+  }
+#endif
+}
+
+/// MipsCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void MipsCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the first four instructions of the
+  // stub with code that jumps to the compiled function:
+  //   lui $t9, %hi(NewVal)
+  //   addiu $t9, $t9, %lo(NewVal)
+  //   jr $t9
+  //   nop
+
+  int Hi = ((unsigned)NewVal & 0xffff0000) >> 16;
+  if ((NewVal & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(NewVal & 0xffff);
+
+  *(intptr_t *)(StubAddr) = 0xf << 26 | 25 << 16 | Hi;
+  *(intptr_t *)(StubAddr + 4) = 9 << 26 | 25 << 21 | 25 << 16 | Lo;
+  *(intptr_t *)(StubAddr + 8) = 25 << 21 | 8;
+  *(intptr_t *)(StubAddr + 12) = 0;
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 16);
+}
+
+TargetJITInfo::LazyResolverFn MipsJITInfo::getLazyResolverFunction(
+    JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return MipsCompilationCallback;
+}
+
+TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
+  // The stub contains 4 4-byte instructions, aligned at 4 bytes. See
+  // emitFunctionStub for details.
+  StubLayout Result = { 4*4, 4 };
+  return Result;
+}
+
+void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn,
+    JITCodeEmitter &JCE) {
+  JCE.emitAlignment(4);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+  if (!sys::Memory::setRangeWritable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  intptr_t EmittedAddr;
+  if (Fn != (void*)(intptr_t)MipsCompilationCallback)
+    EmittedAddr = (intptr_t)Fn;
+  else
+    EmittedAddr = (intptr_t)MipsCompilationCallback;
+
+
+  int Hi = ((unsigned)EmittedAddr & 0xffff0000) >> 16;
+  if ((EmittedAddr & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(EmittedAddr & 0xffff);
+
+  // lui t9, %hi(EmittedAddr)
+  // addiu t9, t9, %lo(EmittedAddr)
+  // jalr t8, t9
+  // nop
+  JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
+  JCE.emitWordLE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+  JCE.emitWordLE(25 << 21 | 24 << 11 | 9);
+  JCE.emitWordLE(0);
+
+  sys::Memory::InvalidateInstructionCache(Addr, 16);
+  if (!sys::Memory::setRangeExecutable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
+    unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((Mips::RelocationType) MR->getRelocationType()) {
+    case Mips::reloc_mips_branch:
+      ResultPtr = (((ResultPtr - (intptr_t) RelocPos) - 4) >> 2) & 0xffff;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_26:
+      ResultPtr = (ResultPtr & 0x0fffffff) >> 2;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_hi:
+      ResultPtr = ResultPtr >> 16;
+      if ((((intptr_t) (MR->getResultPointer()) & 0xffff) >> 15) == 1) {
+        ResultPtr += 1;
+      }
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_lo:
+      ResultPtr = ResultPtr & 0xffff;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    default:
+      llvm_unreachable("ERROR: Unknown Mips relocation.");
+    }
+  }
+}
diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h
new file mode 100644
index 0000000000000..41f32a35f1b05
--- /dev/null
+++ b/lib/Target/Mips/MipsJITInfo.h
@@ -0,0 +1,70 @@
+//===- MipsJITInfo.h - Mips implementation of the JIT interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSJITINFO_H
+#define MIPSJITINFO_H
+
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+class MipsTargetMachine;
+
+class MipsJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+
+  public:
+    explicit MipsJITInfo() :
+      IsPIC(false) {}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on Mips.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function* F, void *Fn,
+        JITCodeEmitter &JCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+        unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    void Initialize(const MachineFunction &MF, bool isPIC) {
+      IsPIC = isPIC;
+    }
+
+};
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index f5cc3aa25f1bc..608a7d21a4f9b 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -29,10 +29,10 @@ MipsMCInstLower::MipsMCInstLower(Mangler *mang, const MachineFunction &mf,
   : Ctx(mf.getContext()), Mang(mang), AsmPrinter(asmprinter) {}
 
 MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
-                                              MachineOperandType MOTy) const {
+                                              MachineOperandType MOTy,
+                                              unsigned Offset) const {
   MipsMCSymbolRefExpr::VariantKind Kind;
   const MCSymbol *Symbol;
-  int Offset = 0;
 
   switch(MO.getTargetFlags()) {
   default:                  assert(0 && "Invalid target flag!");
@@ -46,6 +46,11 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case MipsII::MO_GOTTPREL: Kind = MipsMCSymbolRefExpr::VK_Mips_GOTTPREL; break;
   case MipsII::MO_TPREL_HI: Kind = MipsMCSymbolRefExpr::VK_Mips_TPREL_HI; break;
   case MipsII::MO_TPREL_LO: Kind = MipsMCSymbolRefExpr::VK_Mips_TPREL_LO; break;
+  case MipsII::MO_GPOFF_HI: Kind = MipsMCSymbolRefExpr::VK_Mips_GPOFF_HI; break;
+  case MipsII::MO_GPOFF_LO: Kind = MipsMCSymbolRefExpr::VK_Mips_GPOFF_LO; break;
+  case MipsII::MO_GOT_DISP: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_DISP; break;
+  case MipsII::MO_GOT_PAGE: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
+  case MipsII::MO_GOT_OFST: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_OFST; break;
   }
 
   switch (MOTy) {
@@ -72,7 +77,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     case MachineOperand::MO_ConstantPoolIndex:
       Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
       if (MO.getOffset())
-        Offset = MO.getOffset();  
+        Offset += MO.getOffset();  
       break;
 
     default:
@@ -83,36 +88,39 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                                            Ctx));
 }
 
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO) const {
+  MachineOperandType MOTy = MO.getType();
+  
+  switch (MOTy) {
+  default:
+    assert(0 && "unknown operand type");
+    break;
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit()) break;
+    return MCOperand::CreateReg(MO.getReg());
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm());
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(MO, MOTy, 0);
+ }
+
+  return MCOperand();
+}
+
 void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
   
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    MCOperand MCOp;
-    MachineOperandType MOTy = MO.getType();
+    MCOperand MCOp = LowerOperand(MO);
 
-    switch (MOTy) {
-    default:
-      MI->dump();
-      llvm_unreachable("unknown operand type");
-    case MachineOperand::MO_Register:
-      // Ignore all implicit register operands.
-      if (MO.isImplicit()) continue;
-      MCOp = MCOperand::CreateReg(MO.getReg());
-      break;
-    case MachineOperand::MO_Immediate:
-      MCOp = MCOperand::CreateImm(MO.getImm());
-      break;
-    case MachineOperand::MO_MachineBasicBlock:
-    case MachineOperand::MO_GlobalAddress:
-    case MachineOperand::MO_ExternalSymbol:
-    case MachineOperand::MO_JumpTableIndex:
-    case MachineOperand::MO_ConstantPoolIndex:
-    case MachineOperand::MO_BlockAddress:
-      MCOp = LowerSymbolOperand(MO, MOTy);
-      break;
-    }
-    
-    OutMI.addOperand(MCOp);
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
   }
 }
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index ec5201be7f6dd..223f23aed2868 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -1,4 +1,4 @@
-//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------------------==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -36,7 +36,8 @@ public:
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
 private:
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
-                               MachineOperandType MOTy) const;
+                               MachineOperandType MOTy, unsigned Offset) const;
+  MCOperand LowerOperand(const MachineOperand& MO) const;
 };
 }
 
diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp b/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
index 9a2bdae0e3391..a0a242c8c443c 100644
--- a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
+++ b/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
@@ -33,6 +33,11 @@ void MipsMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
   case VK_Mips_GOTTPREL: OS << "%gottprel("; break;
   case VK_Mips_TPREL_HI: OS << "%tprel_hi("; break;
   case VK_Mips_TPREL_LO: OS << "%tprel_lo("; break;
+  case VK_Mips_GPOFF_HI: OS << "%hi(%neg(%gp_rel("; break;
+  case VK_Mips_GPOFF_LO: OS << "%lo(%neg(%gp_rel("; break;
+  case VK_Mips_GOT_DISP: OS << "%got_disp("; break;
+  case VK_Mips_GOT_PAGE: OS << "%got_page("; break;
+  case VK_Mips_GOT_OFST: OS << "%got_ofst("; break;
   }
 
   OS << *Symbol;
@@ -43,7 +48,9 @@ void MipsMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
     OS << Offset;
   }
 
-  if (Kind != VK_Mips_None)
+  if (Kind == VK_Mips_GPOFF_HI || Kind == VK_Mips_GPOFF_LO)
+    OS << ")))";
+  else if (Kind != VK_Mips_None)
     OS << ')';
 }
 
diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.h b/lib/Target/Mips/MipsMCSymbolRefExpr.h
index 3e695963709e7..55e85a79c1c8a 100644
--- a/lib/Target/Mips/MipsMCSymbolRefExpr.h
+++ b/lib/Target/Mips/MipsMCSymbolRefExpr.h
@@ -25,7 +25,12 @@ public:
     VK_Mips_TLSGD,
     VK_Mips_GOTTPREL,
     VK_Mips_TPREL_HI,
-    VK_Mips_TPREL_LO
+    VK_Mips_TPREL_LO,
+    VK_Mips_GPOFF_HI,
+    VK_Mips_GPOFF_LO,
+    VK_Mips_GOT_DISP,
+    VK_Mips_GOT_PAGE,
+    VK_Mips_GOT_OFST
   };
 
 private:
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index dbb7a6744224f..bc30b6b2425b4 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -51,16 +51,12 @@ private:
   mutable int DynAllocFI; // Frame index of dynamically allocated stack area.   
   unsigned MaxCallFrameSize;
 
-  /// AtomicFrameIndex - To implement atomic.swap and atomic.cmp.swap
-  /// intrinsics, it is necessary to use a temporary stack location.
-  /// This field holds the frame index of this location.
-  int AtomicFrameIndex;
 public:
   MipsFunctionInfo(MachineFunction& MF)
   : MF(MF), SRetReturnReg(0), GlobalBaseReg(0),
     VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
     OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0),
-    MaxCallFrameSize(0), AtomicFrameIndex(-1)
+    MaxCallFrameSize(0)
   {}
 
   bool isInArgFI(int FI) const {
@@ -104,9 +100,6 @@ public:
 
   unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; }
   void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
-
-  int getAtomicFrameIndex() const { return AtomicFrameIndex; }
-  void setAtomicFrameIndex(int Index) { AtomicFrameIndex = Index; }
 };
 
 } // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 24390daff75c3..f8c0fdac8cf0b 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -24,7 +24,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -44,7 +43,7 @@ using namespace llvm;
 
 MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
                                    const TargetInstrInfo &tii)
-  : MipsGenRegisterInfo(), Subtarget(ST), TII(tii) {}
+  : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {}
 
 /// getRegisterNumbering - Given the enum value for some register, e.g.
 /// Mips::RA, return the number that it corresponds to (e.g. 31).
@@ -52,39 +51,87 @@ unsigned MipsRegisterInfo::
 getRegisterNumbering(unsigned RegEnum)
 {
   switch (RegEnum) {
-    case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0;
-    case Mips::AT   : case Mips::F1 : return 1;
-    case Mips::V0   : case Mips::F2 : case Mips::D1 : return 2;
-    case Mips::V1   : case Mips::F3 : return 3;
-    case Mips::A0   : case Mips::F4 : case Mips::D2 : return 4;
-    case Mips::A1   : case Mips::F5 : return 5;
-    case Mips::A2   : case Mips::F6 : case Mips::D3 : return 6;
-    case Mips::A3   : case Mips::F7 : return 7;
-    case Mips::T0   : case Mips::F8 : case Mips::D4 : return 8;
-    case Mips::T1   : case Mips::F9 : return 9;
-    case Mips::T2   : case Mips::F10: case Mips::D5: return 10;
-    case Mips::T3   : case Mips::F11: return 11;
-    case Mips::T4   : case Mips::F12: case Mips::D6: return 12;
-    case Mips::T5   : case Mips::F13: return 13;
-    case Mips::T6   : case Mips::F14: case Mips::D7: return 14;
-    case Mips::T7   : case Mips::F15: return 15;
-    case Mips::S0   : case Mips::F16: case Mips::D8: return 16;
-    case Mips::S1   : case Mips::F17: return 17;
-    case Mips::S2   : case Mips::F18: case Mips::D9: return 18;
-    case Mips::S3   : case Mips::F19: return 19;
-    case Mips::S4   : case Mips::F20: case Mips::D10: return 20;
-    case Mips::S5   : case Mips::F21: return 21;
-    case Mips::S6   : case Mips::F22: case Mips::D11: return 22;
-    case Mips::S7   : case Mips::F23: return 23;
-    case Mips::T8   : case Mips::F24: case Mips::D12: return 24;
-    case Mips::T9   : case Mips::F25: return 25;
-    case Mips::K0   : case Mips::F26: case Mips::D13: return 26;
-    case Mips::K1   : case Mips::F27: return 27;
-    case Mips::GP   : case Mips::F28: case Mips::D14: return 28;
-    case Mips::SP   : case Mips::F29: return 29;
-    case Mips::FP   : case Mips::F30: case Mips::D15: return 30;
-    case Mips::RA   : case Mips::F31: return 31;
-    default: llvm_unreachable("Unknown register number!");
+  case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
+  case Mips::D0:
+    return 0;
+  case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
+    return 1;
+  case Mips::V0: case Mips::V0_64: case Mips::F2: case Mips::D2_64:
+  case Mips::D1:
+    return 2;
+  case Mips::V1: case Mips::V1_64: case Mips::F3: case Mips::D3_64:
+    return 3;
+  case Mips::A0: case Mips::A0_64: case Mips::F4: case Mips::D4_64:
+  case Mips::D2:
+    return 4;
+  case Mips::A1: case Mips::A1_64: case Mips::F5: case Mips::D5_64:
+    return 5;
+  case Mips::A2: case Mips::A2_64: case Mips::F6: case Mips::D6_64:
+  case Mips::D3:
+    return 6;
+  case Mips::A3: case Mips::A3_64: case Mips::F7: case Mips::D7_64:
+    return 7;
+  case Mips::T0: case Mips::T0_64: case Mips::F8: case Mips::D8_64:
+  case Mips::D4:
+    return 8;
+  case Mips::T1: case Mips::T1_64: case Mips::F9: case Mips::D9_64:
+    return 9;
+  case Mips::T2: case Mips::T2_64: case Mips::F10: case Mips::D10_64:
+  case Mips::D5:
+    return 10;
+  case Mips::T3: case Mips::T3_64: case Mips::F11: case Mips::D11_64:
+    return 11;
+  case Mips::T4: case Mips::T4_64: case Mips::F12: case Mips::D12_64:
+  case Mips::D6:
+    return 12;
+  case Mips::T5: case Mips::T5_64: case Mips::F13: case Mips::D13_64:
+    return 13;
+  case Mips::T6: case Mips::T6_64: case Mips::F14: case Mips::D14_64:
+  case Mips::D7:
+    return 14;
+  case Mips::T7: case Mips::T7_64: case Mips::F15: case Mips::D15_64:
+    return 15;
+  case Mips::S0: case Mips::S0_64: case Mips::F16: case Mips::D16_64:
+  case Mips::D8:
+    return 16;
+  case Mips::S1: case Mips::S1_64: case Mips::F17: case Mips::D17_64:
+    return 17;
+  case Mips::S2: case Mips::S2_64: case Mips::F18: case Mips::D18_64:
+  case Mips::D9:
+    return 18;
+  case Mips::S3: case Mips::S3_64: case Mips::F19: case Mips::D19_64:
+    return 19;
+  case Mips::S4: case Mips::S4_64: case Mips::F20: case Mips::D20_64:
+  case Mips::D10:
+    return 20;
+  case Mips::S5: case Mips::S5_64: case Mips::F21: case Mips::D21_64:
+    return 21;
+  case Mips::S6: case Mips::S6_64: case Mips::F22: case Mips::D22_64:
+  case Mips::D11:
+    return 22;
+  case Mips::S7: case Mips::S7_64: case Mips::F23: case Mips::D23_64:
+    return 23;
+  case Mips::T8: case Mips::T8_64: case Mips::F24: case Mips::D24_64:
+  case Mips::D12:
+    return 24;
+  case Mips::T9: case Mips::T9_64: case Mips::F25: case Mips::D25_64:
+    return 25;
+  case Mips::K0: case Mips::K0_64: case Mips::F26: case Mips::D26_64:
+  case Mips::D13:
+    return 26;
+  case Mips::K1: case Mips::K1_64: case Mips::F27: case Mips::D27_64:
+    return 27;
+  case Mips::GP: case Mips::GP_64: case Mips::F28: case Mips::D28_64:
+  case Mips::D14:
+    return 28;
+  case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64:
+    return 29;
+  case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64:
+  case Mips::D15: 
+    return 30;
+  case Mips::RA: case Mips::RA_64: case Mips::F31: case Mips::D31_64:
+    return 31;
+  default: llvm_unreachable("Unknown register number!");
   }
   return 0; // Not reached
 }
@@ -101,7 +148,7 @@ getCalleeSavedRegs(const MachineFunction *MF) const
 {
   // Mips callee-save register range is $16-$23, $f20-$f30
   static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
-    Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
+    Mips::F31, Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
     Mips::F25, Mips::F24, Mips::F23, Mips::F22, Mips::F21, Mips::F20,
     Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
     Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
@@ -113,31 +160,71 @@ getCalleeSavedRegs(const MachineFunction *MF) const
     Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
   };
 
+  static const unsigned N32CalleeSavedRegs[] = {
+    Mips::D31_64, Mips::D29_64, Mips::D27_64, Mips::D25_64, Mips::D23_64,
+    Mips::D21_64,
+    Mips::RA_64, Mips::FP_64, Mips::GP_64, Mips::S7_64, Mips::S6_64,
+    Mips::S5_64, Mips::S4_64, Mips::S3_64, Mips::S2_64, Mips::S1_64,
+    Mips::S0_64, 0
+  };
+
+  static const unsigned N64CalleeSavedRegs[] = {
+    Mips::D31_64, Mips::D30_64, Mips::D29_64, Mips::D28_64, Mips::D27_64,
+    Mips::D26_64, Mips::D25_64, Mips::D24_64,
+    Mips::RA_64, Mips::FP_64, Mips::GP_64, Mips::S7_64, Mips::S6_64,
+    Mips::S5_64, Mips::S4_64, Mips::S3_64, Mips::S2_64, Mips::S1_64,
+    Mips::S0_64, 0
+  };
+
   if (Subtarget.isSingleFloat())
     return SingleFloatOnlyCalleeSavedRegs;
-  else
+  else if (!Subtarget.hasMips64())
     return Mips32CalleeSavedRegs;
+  else if (Subtarget.isABI_N32())
+    return N32CalleeSavedRegs;
+  
+  assert(Subtarget.isABI_N64());
+  return N64CalleeSavedRegs;  
 }
 
 BitVector MipsRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
+  static const unsigned ReservedCPURegs[] = {
+    Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, 
+    Mips::GP, Mips::SP, Mips::FP, Mips::RA, 0
+  };
+
+  static const unsigned ReservedCPU64Regs[] = {
+    Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, 
+    Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64, 0
+  };
+
   BitVector Reserved(getNumRegs());
-  Reserved.set(Mips::ZERO);
-  Reserved.set(Mips::AT);
-  Reserved.set(Mips::K0);
-  Reserved.set(Mips::K1);
-  Reserved.set(Mips::GP);
-  Reserved.set(Mips::SP);
-  Reserved.set(Mips::FP);
-  Reserved.set(Mips::RA);
-  Reserved.set(Mips::F31);
-  Reserved.set(Mips::D15);
-
-  // SRV4 requires that odd register can't be used.
-  if (!Subtarget.isSingleFloat() && !Subtarget.isMips32())
-    for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
-      Reserved.set(FReg);
+  typedef TargetRegisterClass::iterator RegIter;
+
+  for (const unsigned *Reg = ReservedCPURegs; *Reg; ++Reg)
+    Reserved.set(*Reg);
 
+  if (Subtarget.hasMips64()) {
+    for (const unsigned *Reg = ReservedCPU64Regs; *Reg; ++Reg)
+      Reserved.set(*Reg);
+
+    // Reserve all registers in AFGR64.
+    for (RegIter Reg = Mips::AFGR64RegisterClass->begin();
+         Reg != Mips::AFGR64RegisterClass->end(); ++Reg)
+      Reserved.set(*Reg);
+  }
+  else {
+    // Reserve all registers in CPU64Regs & FGR64.
+    for (RegIter Reg = Mips::CPU64RegsRegisterClass->begin();
+         Reg != Mips::CPU64RegsRegisterClass->end(); ++Reg)
+      Reserved.set(*Reg);
+
+    for (RegIter Reg = Mips::FGR64RegisterClass->begin();
+         Reg != Mips::FGR64RegisterClass->end(); ++Reg)
+      Reserved.set(*Reg);
+  }
+  
   return Reserved;
 }
 
@@ -244,11 +331,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   MI.getOperand(i+1).ChangeToImmediate(Offset);
 }
 
-unsigned MipsRegisterInfo::
-getRARegister() const {
-  return Mips::RA;
-}
-
 unsigned MipsRegisterInfo::
 getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -267,12 +349,3 @@ getEHHandlerRegister() const {
   llvm_unreachable("What is the exception handler register");
   return 0;
 }
-
-int MipsRegisterInfo::
-getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  return MipsGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int MipsRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  return MipsGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 646369b5966f4..67e57dd71bdda 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -57,15 +57,11 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
   void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
 
   /// Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   /// Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
-
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index f0db518b754b4..925ad9e70ab6d 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -10,6 +10,11 @@
 //===----------------------------------------------------------------------===//
 //  Declarations that describe the MIPS register file
 //===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_fpeven : SubRegIndex;
+def sub_fpodd  : SubRegIndex;
+def sub_32     : SubRegIndex;
+}
 
 // We have banks of 32 registers each.
 class MipsReg<string n> : Register<n> {
@@ -28,22 +33,31 @@ class MipsGPRReg<bits<5> num, string n> : MipsReg<n> {
   let Num = num;
 }
 
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<5> num, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<n, subregs> {
+  let Num = num;
+  let SubRegIndices = [sub_32];
+}
+
 // Mips 32-bit FPU Registers
 class FPR<bits<5> num, string n> : MipsReg<n> {
   let Num = num;
 }
 
 // Mips 64-bit (aliased) FPU Registers
-let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex;
-def sub_fpodd  : SubRegIndex;
-}
 class AFPR<bits<5> num, string n, list<Register> subregs>
   : MipsRegWithSubRegs<n, subregs> {
   let Num = num;
   let SubRegIndices = [sub_fpeven, sub_fpodd];
 }
 
+class AFPR64<bits<5> num, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<n, subregs> {
+  let Num = num;
+  let SubRegIndices = [sub_32];
+}
+
 // Mips Hardware Registers
 class HWR<bits<5> num, string n> : MipsReg<n> {
   let Num = num;
@@ -54,6 +68,7 @@ class HWR<bits<5> num, string n> : MipsReg<n> {
 //===----------------------------------------------------------------------===//
 
 let Namespace = "Mips" in {
+  // FIXME: Fix DwarfRegNum.
 
   // General Purpose Registers
   def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>;
@@ -89,6 +104,40 @@ let Namespace = "Mips" in {
   def FP   : MipsGPRReg< 30, "FP">,  DwarfRegNum<[30]>;
   def RA   : MipsGPRReg< 31, "RA">,  DwarfRegNum<[31]>;
 
+  // General Purpose 64-bit Registers
+  def ZERO_64 : Mips64GPRReg< 0, "ZERO", [ZERO]>;
+  def AT_64   : Mips64GPRReg< 1, "AT",   [AT]>;
+  def V0_64   : Mips64GPRReg< 2, "2",    [V0]>;
+  def V1_64   : Mips64GPRReg< 3, "3",    [V1]>;
+  def A0_64   : Mips64GPRReg< 4, "4",    [A0]>;
+  def A1_64   : Mips64GPRReg< 5, "5",    [A1]>;
+  def A2_64   : Mips64GPRReg< 6, "6",    [A2]>;
+  def A3_64   : Mips64GPRReg< 7, "7",    [A3]>;
+  def T0_64   : Mips64GPRReg< 8, "8",    [T0]>;
+  def T1_64   : Mips64GPRReg< 9, "9",    [T1]>;
+  def T2_64   : Mips64GPRReg< 10, "10",  [T2]>;   
+  def T3_64   : Mips64GPRReg< 11, "11",  [T3]>;   
+  def T4_64   : Mips64GPRReg< 12, "12",  [T4]>;   
+  def T5_64   : Mips64GPRReg< 13, "13",  [T5]>;   
+  def T6_64   : Mips64GPRReg< 14, "14",  [T6]>;   
+  def T7_64   : Mips64GPRReg< 15, "15",  [T7]>;   
+  def S0_64   : Mips64GPRReg< 16, "16",  [S0]>;   
+  def S1_64   : Mips64GPRReg< 17, "17",  [S1]>;   
+  def S2_64   : Mips64GPRReg< 18, "18",  [S2]>;   
+  def S3_64   : Mips64GPRReg< 19, "19",  [S3]>;   
+  def S4_64   : Mips64GPRReg< 20, "20",  [S4]>;   
+  def S5_64   : Mips64GPRReg< 21, "21",  [S5]>;   
+  def S6_64   : Mips64GPRReg< 22, "22",  [S6]>;   
+  def S7_64   : Mips64GPRReg< 23, "23",  [S7]>;
+  def T8_64   : Mips64GPRReg< 24, "24",  [T8]>;
+  def T9_64   : Mips64GPRReg< 25, "25",  [T9]>;
+  def K0_64   : Mips64GPRReg< 26, "26",  [K0]>;
+  def K1_64   : Mips64GPRReg< 27, "27",  [K1]>;
+  def GP_64   : Mips64GPRReg< 28, "GP",  [GP]>;
+  def SP_64   : Mips64GPRReg< 29, "SP",  [SP]>;
+  def FP_64   : Mips64GPRReg< 30, "FP",  [FP]>;
+  def RA_64   : Mips64GPRReg< 31, "RA",  [RA]>;
+
   /// Mips Single point precision FPU Registers
   def F0  : FPR< 0,  "F0">, DwarfRegNum<[32]>;
   def F1  : FPR< 1,  "F1">, DwarfRegNum<[33]>;
@@ -142,10 +191,49 @@ let Namespace = "Mips" in {
   def D14 : AFPR<28, "F28", [F28, F29]>;
   def D15 : AFPR<30, "F30", [F30, F31]>;
 
+  /// Mips Double point precision FPU Registers in MFP64 mode.
+  def D0_64  : AFPR64<0, "F0", [F0]>;
+  def D1_64  : AFPR64<1, "F1", [F1]>;
+  def D2_64  : AFPR64<2, "F2", [F2]>;
+  def D3_64  : AFPR64<3, "F3", [F3]>;
+  def D4_64  : AFPR64<4, "F4", [F4]>;
+  def D5_64  : AFPR64<5, "F5", [F5]>;
+  def D6_64  : AFPR64<6, "F6", [F6]>;
+  def D7_64  : AFPR64<7, "F7", [F7]>;
+  def D8_64  : AFPR64<8, "F8", [F8]>;
+  def D9_64  : AFPR64<9, "F9", [F9]>;
+  def D10_64  : AFPR64<10, "F10", [F10]>;
+  def D11_64  : AFPR64<11, "F11", [F11]>;
+  def D12_64  : AFPR64<12, "F12", [F12]>;
+  def D13_64  : AFPR64<13, "F13", [F13]>;
+  def D14_64  : AFPR64<14, "F14", [F14]>;
+  def D15_64  : AFPR64<15, "F15", [F15]>;
+  def D16_64  : AFPR64<16, "F16", [F16]>;
+  def D17_64  : AFPR64<17, "F17", [F17]>;
+  def D18_64  : AFPR64<18, "F18", [F18]>;
+  def D19_64  : AFPR64<19, "F19", [F19]>;
+  def D20_64  : AFPR64<20, "F20", [F20]>;
+  def D21_64  : AFPR64<21, "F21", [F21]>;
+  def D22_64  : AFPR64<22, "F22", [F22]>;
+  def D23_64  : AFPR64<23, "F23", [F23]>;
+  def D24_64  : AFPR64<24, "F24", [F24]>;
+  def D25_64  : AFPR64<25, "F25", [F25]>;
+  def D26_64  : AFPR64<26, "F26", [F26]>;
+  def D27_64  : AFPR64<27, "F27", [F27]>;
+  def D28_64  : AFPR64<28, "F28", [F28]>;
+  def D29_64  : AFPR64<29, "F29", [F29]>;
+  def D30_64  : AFPR64<30, "F30", [F30]>;
+  def D31_64  : AFPR64<31, "F31", [F31]>;
+
   // Hi/Lo registers
   def HI  : Register<"hi">, DwarfRegNum<[64]>;
   def LO  : Register<"lo">, DwarfRegNum<[65]>;
 
+  let SubRegIndices = [sub_32] in {
+  def HI64  : RegisterWithSubRegs<"hi", [HI]>;
+  def LO64  : RegisterWithSubRegs<"lo", [LO]>;
+  }
+
   // Status flags register
   def FCR31 : Register<"31">;
 
@@ -167,6 +255,18 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, (add
   // Reserved
   ZERO, AT, K0, K1, GP, SP, FP, RA)>;
 
+def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
+  // Return Values and Arguments
+  V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+  // Not preserved across procedure calls
+  T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, T8_64, T9_64,
+  // Callee save
+  S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+  // Reserved
+  ZERO_64, AT_64, K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)> {
+  let SubRegClasses = [(CPURegs sub_32)];
+}
+
 // 64bit fp:
 // * FGR64  - 32 64-bit registers
 // * AFGR64 - 16 32-bit even registers (32-bit FP Mode)
@@ -182,17 +282,22 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
   // Not preserved across procedure calls
   D2, D3, D4, D5, D8, D9,
   // Callee save
-  D10, D11, D12, D13, D14,
-  // Reserved
-  D15)> {
+  D10, D11, D12, D13, D14, D15)> {
   let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)];
 }
 
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> {
+  let SubRegClasses = [(FGR32 sub_32)];
+}
+
 // Condition Register for floating point operations
 def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31)>;
 
 // Hi/Lo Registers
 def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>;
+def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> {
+  let SubRegClasses = [(HILO sub_32)];
+}
 
 // Hardware registers
 def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
diff --git a/lib/Target/Mips/MipsRelocations.h b/lib/Target/Mips/MipsRelocations.h
new file mode 100644
index 0000000000000..66d1bfd993f54
--- /dev/null
+++ b/lib/Target/Mips/MipsRelocations.h
@@ -0,0 +1,41 @@
+//===- MipsRelocations.h - Mips Code Relocations ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file defines the Mips target-specific relocation types
+// (for relocation-model=static).
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef MIPSRELOCATIONS_H_
+#define MIPSRELOCATIONS_H_
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace Mips{
+    enum RelocationType {
+      // reloc_mips_branch - pc relative relocation for branches. The lower 18
+      // bits of the difference between the branch target and the branch
+      // instruction, shifted right by 2.
+      reloc_mips_branch = 1,
+
+      // reloc_mips_hi - upper 16 bits of the address (modified by +1 if the
+      // lower 16 bits of the address is negative).
+      reloc_mips_hi = 2,
+
+      // reloc_mips_lo - lower 16 bits of the address.
+      reloc_mips_lo = 3,
+
+      // reloc_mips_26 - lower 28 bits of the address, shifted right by 2.
+      reloc_mips_26 = 4
+    };
+  }
+}
+
+#endif /* MIPSRELOCATIONS_H_ */
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 6eee3333d584a..016d449a10671 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -13,7 +13,7 @@
 
 #include "MipsSubtarget.h"
 #include "Mips.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
@@ -24,15 +24,14 @@ using namespace llvm;
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                              const std::string &FS, bool little) :
   MipsGenSubtargetInfo(TT, CPU, FS),
-  MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false),
-  IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true),
-  HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false),
-  HasSwap(false), HasBitCount(false)
+  MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), 
+  IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
+  IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false),
+  HasMinMax(false), HasSwap(false), HasBitCount(false)
 {
   std::string CPUName = CPU;
   if (CPUName.empty())
-    CPUName = "mips1";
-  MipsArchVersion = Mips1;
+    CPUName = "mips32r1";
 
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FS);
@@ -40,23 +39,16 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
+  // Set MipsABI if it hasn't been set yet.
+  if (MipsABI == UnknownABI)
+    MipsABI = hasMips64() ? N64 : O32; 
+
+  // Check if Architecture and ABI are compatible.
+  assert(((!hasMips64() && (isABI_O32() || isABI_EABI())) ||
+          (hasMips64() && (isABI_N32() || isABI_N64()))) &&
+         "Invalid  Arch & ABI pair.");
+
   // Is the target system Linux ?
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
-
-  // When only the target triple is specified and is
-  // a allegrex target, set the features. We also match
-  // big and little endian allegrex cores (dont really
-  // know if a big one exists)
-  if (TT.find("mipsallegrex") != std::string::npos ||
-      TT.find("psp") != std::string::npos) {
-    MipsABI = EABI;
-    IsSingleFloat = true;
-    MipsArchVersion = Mips2;
-    HasVFPU = true; // Enables Allegrex Vector FPU (not supported yet)
-    HasSEInReg = true;
-    HasBitCount = true;
-    HasSwap = true;
-    HasCondMov = true;
-  }
 }
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 533d4afe073ee..d9dddad23a48e 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -27,14 +27,15 @@ class StringRef;
 class MipsSubtarget : public MipsGenSubtargetInfo {
 
 public:
+  // NOTE: O64 will not be supported.
   enum MipsABIEnum {
-    O32, O64, N32, N64, EABI
+    UnknownABI, O32, N32, N64, EABI
   };
 
 protected:
 
   enum MipsArchEnum {
-    Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2
+    Mips32, Mips32r2, Mips64, Mips64r2
   };
 
   // Mips architecture version
@@ -90,6 +91,8 @@ public:
 
   /// Only O32 and EABI supported right now.
   bool isABI_EABI() const { return MipsABI == EABI; }
+  bool isABI_N64() const { return MipsABI == N64; }
+  bool isABI_N32() const { return MipsABI == N32; }
   bool isABI_O32() const { return MipsABI == O32; }
   unsigned getTargetABI() const { return MipsABI; }
 
@@ -102,9 +105,11 @@ public:
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  bool isMips1() const { return MipsArchVersion == Mips1; }
-  bool isMips32() const { return MipsArchVersion >= Mips32; }
-  bool isMips32r2() const { return MipsArchVersion == Mips32r2; }
+  bool hasMips32() const { return MipsArchVersion >= Mips32; }
+  bool hasMips32r2() const { return MipsArchVersion == Mips32r2 ||
+                                   MipsArchVersion == Mips64r2; }
+  bool hasMips64() const { return MipsArchVersion >= Mips64; }
+  bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
 
   bool isLittle() const { return IsLittle; }
   bool isFP64bit() const { return IsFP64bit; }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 20b9f4ea38530..6480da3e6dfcd 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -14,13 +14,15 @@
 #include "Mips.h"
 #include "MipsTargetMachine.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 extern "C" void LLVMInitializeMipsTarget() {
   // Register the target.
-  RegisterTargetMachine<MipsTargetMachine> X(TheMipsTarget);
+  RegisterTargetMachine<MipsebTargetMachine> X(TheMipsTarget);
   RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+  RegisterTargetMachine<Mips64ebTargetMachine> A(TheMips64Target);
+  RegisterTargetMachine<Mips64elTargetMachine> B(TheMips64elTarget);
 }
 
 // DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -31,30 +33,47 @@ extern "C" void LLVMInitializeMipsTarget() {
 // an easier handling.
 // Using CodeModel::Large enables different CALL behavior.
 MipsTargetMachine::
-MipsTargetMachine(const Target &T, const std::string &TT,
-                  const std::string &CPU, const std::string &FS,
-                  bool isLittle=false):
-  LLVMTargetMachine(T, TT, CPU, FS),
+MipsTargetMachine(const Target &T, StringRef TT,
+                  StringRef CPU, StringRef FS,
+                  Reloc::Model RM, CodeModel::Model CM,
+                  bool isLittle):
+  LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
   Subtarget(TT, CPU, FS, isLittle),
-  DataLayout(isLittle ? 
-             std::string("e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
-             std::string("E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
+  DataLayout(isLittle ?
+             (Subtarget.isABI_N64() ?
+              "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
+              "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
+             (Subtarget.isABI_N64() ?
+              "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
+              "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
   InstrInfo(*this),
   FrameLowering(Subtarget),
-  TLInfo(*this), TSInfo(*this) {
-  // Abicall enables PIC by default
-  if (getRelocationModel() == Reloc::Default) {
-    if (Subtarget.isABI_O32())
-      setRelocationModel(Reloc::PIC_);
-    else
-      setRelocationModel(Reloc::Static);
-  }
+  TLInfo(*this), TSInfo(*this), JITInfo() {
 }
 
+MipsebTargetMachine::
+MipsebTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS,
+                    Reloc::Model RM, CodeModel::Model CM) :
+  MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {}
+
 MipselTargetMachine::
-MipselTargetMachine(const Target &T, const std::string &TT,
-                    const std::string &CPU, const std::string &FS) :
-  MipsTargetMachine(T, TT, CPU, FS, true) {}
+MipselTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS,
+                    Reloc::Model RM, CodeModel::Model CM) :
+  MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {}
+
+Mips64ebTargetMachine::
+Mips64ebTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM) :
+  MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {}
+
+Mips64elTargetMachine::
+Mips64elTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM) :
+  MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {}
 
 // Install an instruction selector pass using
 // the ISelDag to gen Mips code.
@@ -77,7 +96,10 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
 
 bool MipsTargetMachine::
 addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
-  PM.add(createMipsEmitGPRestorePass(*this));
+  // Do not restore $gp if target is Mips64.
+  // In N32/64, $gp is a callee-saved register.
+  if (!Subtarget.hasMips64())
+    PM.add(createMipsEmitGPRestorePass(*this));
   return true;
 }
 
@@ -86,3 +108,12 @@ addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
   PM.add(createMipsExpandPseudoPass(*this));
   return true;
 }
+
+bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel,
+                                          JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Mips.
+  PM.add(createMipsJITCodeEmitterPass(*this, JCE));
+  return false;
+}
+
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index a021af2ff16df..118ed107c5145 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -22,6 +22,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetFrameLowering.h"
+#include "MipsJITInfo.h"
 
 namespace llvm {
   class formatted_raw_ostream;
@@ -33,9 +34,12 @@ namespace llvm {
     MipsFrameLowering   FrameLowering;
     MipsTargetLowering  TLInfo;
     MipsSelectionDAGInfo TSInfo;
+    MipsJITInfo JITInfo;
+
   public:
-    MipsTargetMachine(const Target &T, const std::string &TT,
-                      const std::string &CPU, const std::string &FS,
+    MipsTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM,
                       bool isLittle);
 
     virtual const MipsInstrInfo   *getInstrInfo()     const
@@ -46,6 +50,9 @@ namespace llvm {
     { return &Subtarget; }
     virtual const TargetData      *getTargetData()    const
     { return &DataLayout;}
+    virtual MipsJITInfo *getJITInfo()
+    { return &JITInfo; }
+
 
     virtual const MipsRegisterInfo *getRegisterInfo()  const {
       return &InstrInfo.getRegisterInfo();
@@ -67,16 +74,47 @@ namespace llvm {
     virtual bool addPreRegAlloc(PassManagerBase &PM,
                                 CodeGenOpt::Level OptLevel);
     virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level);
+    virtual bool addCodeEmitter(PassManagerBase &PM,
+				 CodeGenOpt::Level OptLevel,
+				 JITCodeEmitter &JCE);
+
   };
 
-/// MipselTargetMachine - Mipsel target machine.
+/// MipsebTargetMachine - Mips32 big endian target machine.
+///
+class MipsebTargetMachine : public MipsTargetMachine {
+public:
+  MipsebTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM);
+};
+
+/// MipselTargetMachine - Mips32 little endian target machine.
 ///
 class MipselTargetMachine : public MipsTargetMachine {
 public:
-  MipselTargetMachine(const Target &T, const std::string &TT,
-                      const std::string &CPU, const std::string &FS);
+  MipselTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM);
 };
 
+/// Mips64ebTargetMachine - Mips64 big endian target machine.
+///
+class Mips64ebTargetMachine : public MipsTargetMachine {
+public:
+  Mips64ebTargetMachine(const Target &T, StringRef TT,
+                        StringRef CPU, StringRef FS,
+                        Reloc::Model RM, CodeModel::Model CM);
+};
+
+/// Mips64elTargetMachine - Mips64 little endian target machine.
+///
+class Mips64elTargetMachine : public MipsTargetMachine {
+public:
+  Mips64elTargetMachine(const Target &T, StringRef TT,
+                        StringRef CPU, StringRef FS,
+                        Reloc::Model RM, CodeModel::Model CM);
+};
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index cf5d1b58addd7..05c46f5c97a5c 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -79,7 +79,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
   if (Kind.isMergeable1ByteCString())
     return false;
 
-  const Type *Ty = GV->getType()->getElementType();
+  Type *Ty = GV->getType()->getElementType();
   return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
 }
 
diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt
index 6e5d56ba4ae78..5692604504a8e 100644
--- a/lib/Target/Mips/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMMipsInfo
   MipsTargetInfo.cpp
   )
 
-add_dependencies(LLVMMipsInfo MipsCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMipsInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMMipsInfo MipsCommonTableGen)
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index a8d6fe94b1ad8..243632b20aac0 100644
--- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -9,13 +9,23 @@
 
 #include "Mips.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+Target llvm::TheMips64Target, llvm::TheMips64elTarget;
 
 extern "C" void LLVMInitializeMipsTargetInfo() {
-  RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips");
+  RegisterTarget<Triple::mips,
+        /*HasJIT=*/true> X(TheMipsTarget, "mips", "Mips");
 
-  RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel");
+  RegisterTarget<Triple::mipsel,
+        /*HasJIT=*/true> Y(TheMipselTarget, "mipsel", "Mipsel");
+
+  RegisterTarget<Triple::mips64,
+        /*HasJIT=*/false> A(TheMips64Target, "mips64", "Mips64 [experimental]");
+
+  RegisterTarget<Triple::mips64el,
+        /*HasJIT=*/false> B(TheMips64elTarget,
+                            "mips64el", "Mips64el [experimental]");
 }
diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt
index ce08916aaac1f..6e87b171d896c 100644
--- a/lib/Target/PTX/CMakeLists.txt
+++ b/lib/Target/PTX/CMakeLists.txt
@@ -1,24 +1,44 @@
 set(LLVM_TARGET_DEFINITIONS PTX.td)
 
-tablegen(PTXGenAsmWriter.inc -gen-asm-writer)
-tablegen(PTXGenCallingConv.inc -gen-callingconv)
-tablegen(PTXGenDAGISel.inc -gen-dag-isel)
-tablegen(PTXGenInstrInfo.inc -gen-instr-info)
-tablegen(PTXGenRegisterInfo.inc -gen-register-info)
-tablegen(PTXGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(PTXGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(PTXGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(PTXGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(PTXGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(PTXGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(PTXCommonTableGen)
 
 add_llvm_target(PTXCodeGen
   PTXAsmPrinter.cpp
   PTXISelDAGToDAG.cpp
   PTXISelLowering.cpp
   PTXInstrInfo.cpp
+  PTXFPRoundingModePass.cpp
   PTXFrameLowering.cpp
   PTXMCAsmStreamer.cpp
+  PTXMCInstLower.cpp
   PTXMFInfoExtract.cpp
+  PTXParamManager.cpp
+  PTXRegAlloc.cpp
   PTXRegisterInfo.cpp
+  PTXSelectionDAGInfo.cpp
   PTXSubtarget.cpp
   PTXTargetMachine.cpp
   )
 
+add_llvm_library_dependencies(LLVMPTXCodeGen
+  LLVMAnalysis
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMPTXDesc
+  LLVMPTXInfo
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
+
diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000000000..029d06031dd23
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/CMakeLists.txt
@@ -0,0 +1,13 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPTXAsmPrinter
+  PTXInstPrinter.cpp
+  )
+
+add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen)
+
+add_llvm_library_dependencies(LLVMPTXAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
+
diff --git a/lib/Target/PTX/InstPrinter/Makefile b/lib/Target/PTX/InstPrinter/Makefile
new file mode 100644
index 0000000000000..0ccfe44078960
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/PTX/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+#											The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPTXAsmPrinter
+
+# Hack: we need to include 'main' ptx target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
new file mode 100644
index 0000000000000..aabb404dad684
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
@@ -0,0 +1,192 @@
+//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "PTXInstPrinter.h"
+#include "MCTargetDesc/PTXBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define GET_INSTRUCTION_NAME
+#include "PTXGenAsmWriter.inc"
+
+PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI,
+                               const MCSubtargetInfo &STI) :
+  MCInstPrinter(MAI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+StringRef PTXInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return getInstructionName(Opcode);
+}
+
+void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  printPredicate(MI, O);
+  switch (MI->getOpcode()) {
+  default:
+    printInstruction(MI, O);
+    break;
+  case PTX::CALL:
+    printCall(MI, O);
+  }
+  O << ";";
+  printAnnotation(O, Annot);
+}
+
+void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) {
+  // The last two operands are the predicate operands
+  int RegIndex;
+  int OpIndex;
+
+  if (MI->getOpcode() == PTX::CALL) {
+    RegIndex = 0;
+    OpIndex  = 1;
+  } else {
+    RegIndex = MI->getNumOperands()-2;
+    OpIndex = MI->getNumOperands()-1;
+  }
+
+  int PredOp = MI->getOperand(OpIndex).getImm();
+  if (PredOp == PTXPredicate::None)
+    return;
+
+  if (PredOp == PTXPredicate::Negate)
+    O << '!';
+  else
+    O << '@';
+
+  printOperand(MI, RegIndex, O);
+}
+
+void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) {
+  O << "\tcall.uni\t";
+  // The first two operands are the predicate slot
+  unsigned Index = 2;
+  unsigned NumRets = MI->getOperand(Index++).getImm();
+
+  if (NumRets > 0) {
+    O << "(";
+    printOperand(MI, Index++, O);
+    for (unsigned i = 1; i < NumRets; ++i) {
+      O << ", ";
+      printOperand(MI, Index++, O);
+    }
+    O << "), ";
+  }
+
+  O << *(MI->getOperand(Index++).getExpr()) << ", (";
+
+  unsigned NumArgs = MI->getOperand(Index++).getImm();
+  if (NumArgs > 0) {
+    printOperand(MI, Index++, O);
+    for (unsigned i = 1; i < NumArgs; ++i) {
+      O << ", ";
+      printOperand(MI, Index++, O);
+    }
+  }
+  O << ")";
+}
+
+void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    O << Op.getImm();
+  } else if (Op.isFPImm()) {
+    double Imm = Op.getFPImm();
+    APFloat FPImm(Imm);
+    APInt FPIntImm = FPImm.bitcastToAPInt();
+    O << "0D";
+    // PTX requires us to output the full 64 bits, even if the number is zero
+    if (FPIntImm.getZExtValue() > 0) {
+      O << FPIntImm.toString(16, false);
+    } else {
+      O << "0000000000000000";
+    }
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+      const MCSymbol &Sym = SymRefExpr->getSymbol();
+      O << Sym.getName();
+    } else {
+      O << *Op.getExpr();
+    }
+  }
+}
+
+void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  // By definition, operand OpNo+1 is an i32imm
+  const MCOperand &Op2 = MI->getOperand(OpNo+1);
+  printOperand(MI, OpNo, O);
+  if (Op2.getImm() == 0)
+    return; // don't print "+0"
+  O << "+" << Op2.getImm();
+}
+
+void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert (Op.isImm() && "Rounding modes must be immediate values");
+  switch (Op.getImm()) {
+  default:
+    llvm_unreachable("Unknown rounding mode!");
+  case PTXRoundingMode::RndDefault:
+    llvm_unreachable("FP rounding-mode pass did not handle instruction!");
+    break;
+  case PTXRoundingMode::RndNone:
+    // Do not print anything.
+    break;
+  case PTXRoundingMode::RndNearestEven:
+    O << ".rn";
+    break;
+  case PTXRoundingMode::RndTowardsZero:
+    O << ".rz";
+    break;
+  case PTXRoundingMode::RndNegInf:
+    O << ".rm";
+    break;
+  case PTXRoundingMode::RndPosInf:
+    O << ".rp";
+    break;
+  case PTXRoundingMode::RndApprox:
+    O << ".approx";
+    break;
+  case PTXRoundingMode::RndNearestEvenInt:
+    O << ".rni";
+    break;
+  case PTXRoundingMode::RndTowardsZeroInt:
+    O << ".rzi";
+    break;
+  case PTXRoundingMode::RndNegInfInt:
+    O << ".rmi";
+    break;
+  case PTXRoundingMode::RndPosInfInt:
+    O << ".rpi";
+    break;
+  }
+}
+
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
new file mode 100644
index 0000000000000..86dfd482885bd
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
@@ -0,0 +1,47 @@
+//===-- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints n PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXINSTPRINTER_H
+#define PTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class PTXInstPrinter : public MCInstPrinter {
+public:
+  PTXInstPrinter(const MCAsmInfo &MAI, const MCSubtargetInfo &STI);
+
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  virtual StringRef getOpcodeName(unsigned Opcode) const;
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+
+  static const char *getInstructionName(unsigned Opcode);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printPredicate(const MCInst *MI, raw_ostream &O);
+  void printCall(const MCInst *MI, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+}
+
+#endif
+
diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
index df0f63fdba607..811ef4bd1fcf7 100644
--- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,12 @@ add_llvm_library(LLVMPTXDesc
   PTXMCTargetDesc.cpp
   PTXMCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMPTXDesc
+  LLVMMC
+  LLVMPTXInfo
+  LLVMPTXAsmPrinter
+  LLVMSupport
+  )
+
+add_dependencies(LLVMPTXDesc PTXCommonTableGen)
diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
new file mode 100644
index 0000000000000..c6094be4d15be
--- /dev/null
+++ b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
@@ -0,0 +1,63 @@
+//===-- PTXBaseInfo.h - Top level definitions for PTX -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the PTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXBASEINFO_H
+#define PTXBASEINFO_H
+
+#include "PTXMCTargetDesc.h"
+
+namespace llvm {
+  namespace PTXStateSpace {
+    enum {
+      Global    = 0, // default to global state space
+      Constant  = 1,
+      Local     = 2,
+      Parameter = 3,
+      Shared    = 4
+    };
+  } // namespace PTXStateSpace
+
+  namespace PTXPredicate {
+    enum {
+      Normal = 0,
+      Negate = 1,
+      None   = 2
+    };
+  } // namespace PTXPredicate
+
+  /// Namespace to hold all target-specific flags.
+  namespace PTXRoundingMode {
+    // Instruction Flags
+    enum {
+      // Rounding Mode Flags
+      RndMask             = 15,
+      RndDefault          =  0, // ---
+      RndNone             =  1, // <NONE>
+      RndNearestEven      =  2, // .rn
+      RndTowardsZero      =  3, // .rz
+      RndNegInf           =  4, // .rm
+      RndPosInf           =  5, // .rp
+      RndApprox           =  6, // .approx
+      RndNearestEvenInt   =  7, // .rni
+      RndTowardsZeroInt   =  8, // .rzi
+      RndNegInfInt        =  9, // .rmi
+      RndPosInfInt        = 10  // .rpi
+    };
+  } // namespace PTXII
+} // namespace llvm
+
+#endif
+
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
index 23f70bd137879..a5af3b880135b 100644
--- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
+++ b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
@@ -13,10 +13,12 @@
 
 #include "PTXMCTargetDesc.h"
 #include "PTXMCAsmInfo.h"
+#include "InstPrinter/PTXInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "PTXGenInstrInfo.inc"
@@ -35,9 +37,11 @@ static MCInstrInfo *createPTXMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializePTXMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo);
+static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  // PTX does not have a return address register.
+  InitPTXMCRegisterInfo(X, 0);
+  return X;
 }
 
 static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,14 +51,45 @@ static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializePTXMCSubtargetInfo() {
+static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+static MCInstPrinter *createPTXMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCSubtargetInfo &STI) {
+  assert(SyntaxVariant == 0 && "We only have one syntax variant");
+  return new PTXInstPrinter(MAI, STI);
+}
+
+extern "C" void LLVMInitializePTXTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target);
+  RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(ThePTX32Target, createPTXMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePTX64Target, createPTXMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(ThePTX32Target, createPTXMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePTX64Target, createPTXMCRegisterInfo);
+
+  // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(ThePTX32Target,
                                           createPTXMCSubtargetInfo);
   TargetRegistry::RegisterMCSubtargetInfo(ThePTX64Target,
                                           createPTXMCSubtargetInfo);
-}
 
-extern "C" void LLVMInitializePTXMCAsmInfo() {
-  RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target);
-  RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target);
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(ThePTX32Target, createPTXMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePTX64Target, createPTXMCInstPrinter);
 }
diff --git a/lib/Target/PTX/Makefile b/lib/Target/PTX/Makefile
index 93dd38aca7ece..fa09634965ac7 100644
--- a/lib/Target/PTX/Makefile
+++ b/lib/Target/PTX/Makefile
@@ -13,12 +13,11 @@ TARGET = PTX
 
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = PTXGenAsmWriter.inc \
-		PTXGenCallingConv.inc \
 		PTXGenDAGISel.inc \
 		PTXGenInstrInfo.inc \
 		PTXGenRegisterInfo.inc \
 		PTXGenSubtargetInfo.inc
 
-DIRS = TargetInfo MCTargetDesc
+DIRS = InstPrinter TargetInfo MCTargetDesc
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h
index 28cab2429c81c..7d46cce4aeca9 100644
--- a/lib/Target/PTX/PTX.h
+++ b/lib/Target/PTX/PTX.h
@@ -15,34 +15,30 @@
 #ifndef PTX_H
 #define PTX_H
 
-#include "MCTargetDesc/PTXMCTargetDesc.h"
+#include "MCTargetDesc/PTXBaseInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
+  class MachineInstr;
+  class MCInst;
+  class PTXAsmPrinter;
   class PTXTargetMachine;
   class FunctionPass;
 
-  namespace PTX {
-    enum StateSpace {
-      GLOBAL = 0, // default to global state space
-      CONSTANT = 1,
-      LOCAL = 2,
-      PARAMETER = 3,
-      SHARED = 4
-    };
-
-    enum Predicate {
-      PRED_NORMAL = 0,
-      PRED_NEGATE = 1
-    };
-  } // namespace PTX
-
   FunctionPass *createPTXISelDag(PTXTargetMachine &TM,
                                  CodeGenOpt::Level OptLevel);
 
   FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM,
                                        CodeGenOpt::Level OptLevel);
 
+  FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM,
+                                            CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createPTXRegisterAllocator();
+
+  void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    PTXAsmPrinter &AP);
+
 } // namespace llvm;
 
 #endif // PTX_H
diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
index f6fbe9fffc6f7..693bb9c483446 100644
--- a/lib/Target/PTX/PTX.td
+++ b/lib/Target/PTX/PTX.td
@@ -52,13 +52,13 @@ def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2",
 def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3",
                                    "Use Shader Model 1.3">;
 def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0",
-                                   "Use Shader Model 2.0">;
+                                   "Use Shader Model 2.0", [FeatureDouble]>;
 def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1",
-                                   "Use Shader Model 2.1">;
+                                   "Use Shader Model 2.1", [FeatureDouble]>;
 def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2",
-                                   "Use Shader Model 2.2">;
+                                   "Use Shader Model 2.2", [FeatureDouble]>;
 def FeatureSM23 : SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3",
-                                   "Use Shader Model 2.3">;
+                                   "Use Shader Model 2.3", [FeatureDouble]>;
 
 def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget",
                                         "PTX_COMPUTE_1_0",
@@ -74,7 +74,8 @@ def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget",
                                         "Use Compute Compatibility 1.3">;
 def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget",
                                         "PTX_COMPUTE_2_0",
-                                        "Use Compute Compatibility 2.0">;
+                                        "Use Compute Compatibility 2.0",
+                                        [FeatureDouble]>;
 
 //===----------------------------------------------------------------------===//
 // PTX supported processors
@@ -112,12 +113,6 @@ def : Proc<"fermi",      [FeatureSM20, FeatureDouble]>;
 
 include "PTXRegisterInfo.td"
 
-//===----------------------------------------------------------------------===//
-// Calling Conventions
-//===----------------------------------------------------------------------===//
-
-include "PTXCallingConv.td"
-
 //===----------------------------------------------------------------------===//
 // Instruction Descriptions
 //===----------------------------------------------------------------------===//
@@ -126,10 +121,21 @@ include "PTXInstrInfo.td"
 
 def PTXInstrInfo : InstrInfo;
 
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// PTX uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def PTXAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Target Declaration
 //===----------------------------------------------------------------------===//
 
 def PTX : Target {
   let InstructionSet = PTXInstrInfo;
+  let AssemblyWriters = [PTXAsmWriter];
 }
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
index 2848d5460eee0..733744bbd08bf 100644
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -15,9 +15,14 @@
 #define DEBUG_TYPE "ptx-asm-printer"
 
 #include "PTX.h"
+#include "PTXAsmPrinter.h"
 #include "PTXMachineFunctionInfo.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
 #include "PTXTargetMachine.h"
+#include "llvm/Argument.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
 #include "llvm/Module.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -28,69 +33,32 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-namespace {
-class PTXAsmPrinter : public AsmPrinter {
-public:
-  explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {}
-
-  const char *getPassName() const { return "PTX Assembly Printer"; }
-
-  bool doFinalization(Module &M);
-
-  virtual void EmitStartOfAsmFile(Module &M);
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  virtual void EmitFunctionBodyStart();
-  virtual void EmitFunctionBodyEnd() { OutStreamer.EmitRawText(Twine("}")); }
-
-  virtual void EmitInstruction(const MachineInstr *MI);
-
-  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
-  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
-                       const char *Modifier = 0);
-  void printParamOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
-                         const char *Modifier = 0);
-  void printReturnOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
-                          const char *Modifier = 0); 
-  void printPredicateOperand(const MachineInstr *MI, raw_ostream &O);
-
-  unsigned GetOrCreateSourceID(StringRef FileName,
-                               StringRef DirName);
-
-  // autogen'd.
-  void printInstruction(const MachineInstr *MI, raw_ostream &OS);
-  static const char *getRegisterName(unsigned RegNo);
-
-private:
-  void EmitVariableDeclaration(const GlobalVariable *gv);
-  void EmitFunctionDeclaration();
-
-  StringMap<unsigned> SourceIdMap;
-}; // class PTXAsmPrinter
-} // namespace
-
 static const char PARAM_PREFIX[] = "__param_";
 static const char RETURN_PREFIX[] = "__ret_";
 
-static const char *getRegisterTypeName(unsigned RegNo) {
-#define TEST_REGCLS(cls, clsstr)                \
-  if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
+static const char *getRegisterTypeName(unsigned RegNo,
+                                       const MachineRegisterInfo& MRI) {
+  const TargetRegisterClass *TRC = MRI.getRegClass(RegNo);
+
+#define TEST_REGCLS(cls, clsstr) \
+  if (PTX::cls ## RegisterClass == TRC) return # clsstr;
+
   TEST_REGCLS(RegPred, pred);
   TEST_REGCLS(RegI16, b16);
   TEST_REGCLS(RegI32, b32);
@@ -106,16 +74,16 @@ static const char *getRegisterTypeName(unsigned RegNo) {
 static const char *getStateSpaceName(unsigned addressSpace) {
   switch (addressSpace) {
   default: llvm_unreachable("Unknown state space");
-  case PTX::GLOBAL:    return "global";
-  case PTX::CONSTANT:  return "const";
-  case PTX::LOCAL:     return "local";
-  case PTX::PARAMETER: return "param";
-  case PTX::SHARED:    return "shared";
+  case PTXStateSpace::Global:    return "global";
+  case PTXStateSpace::Constant:  return "const";
+  case PTXStateSpace::Local:     return "local";
+  case PTXStateSpace::Parameter: return "param";
+  case PTXStateSpace::Shared:    return "shared";
   }
   return NULL;
 }
 
-static const char *getTypeName(const Type* type) {
+static const char *getTypeName(Type* type) {
   while (true) {
     switch (type->getTypeID()) {
       default: llvm_unreachable("Unknown type");
@@ -130,7 +98,7 @@ static const char *getTypeName(const Type* type) {
         }
       case Type::ArrayTyID:
       case Type::PointerTyID:
-        type = dyn_cast<const SequentialType>(type)->getElementType();
+        type = dyn_cast<SequentialType>(type)->getElementType();
         break;
     }
   }
@@ -170,6 +138,7 @@ void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
 {
   const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
 
+  // Emit the PTX .version and .target attributes
   OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString()));
   OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() +
                                 (ST.supportsDouble() ? ""
@@ -203,177 +172,118 @@ void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
     EmitVariableDeclaration(i);
 }
 
-bool PTXAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  SetupMachineFunction(MF);
-  EmitFunctionDeclaration();
-  EmitFunctionBody();
-  return false;
-}
-
 void PTXAsmPrinter::EmitFunctionBodyStart() {
   OutStreamer.EmitRawText(Twine("{"));
 
   const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  const PTXParamManager &PM = MFI->getParamManager();
+
+  // Print register definitions
+  std::string regDefs;
+  unsigned numRegs;
+
+  // pred
+  numRegs = MFI->getNumRegistersForClass(PTX::RegPredRegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .pred %p<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
+
+  // i16
+  numRegs = MFI->getNumRegistersForClass(PTX::RegI16RegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .b16 %rh<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
+
+  // i32
+  numRegs = MFI->getNumRegistersForClass(PTX::RegI32RegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .b32 %r<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
+
+  // i64
+  numRegs = MFI->getNumRegistersForClass(PTX::RegI64RegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .b64 %rd<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
+
+  // f32
+  numRegs = MFI->getNumRegistersForClass(PTX::RegF32RegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .f32 %f<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
+
+  // f64
+  numRegs = MFI->getNumRegistersForClass(PTX::RegF64RegisterClass);
+  if(numRegs > 0) {
+    regDefs += "\t.reg .f64 %fd<";
+    regDefs += utostr(numRegs);
+    regDefs += ">;\n";
+  }
 
-  // Print local variable definition
-  for (PTXMachineFunctionInfo::reg_iterator
-       i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd(); i != e; ++ i) {
-    unsigned reg = *i;
-
-    std::string def = "\t.reg .";
-    def += getRegisterTypeName(reg);
-    def += ' ';
-    def += getRegisterName(reg);
-    def += ';';
-    OutStreamer.EmitRawText(Twine(def));
+  // Local params
+  for (PTXParamManager::param_iterator i = PM.local_begin(), e = PM.local_end();
+       i != e; ++i) {
+    regDefs += "\t.param .b";
+    regDefs += utostr(PM.getParamSize(*i));
+    regDefs += " ";
+    regDefs += PM.getParamName(*i);
+    regDefs += ";\n";
   }
 
+  OutStreamer.EmitRawText(Twine(regDefs));
+
+
   const MachineFrameInfo* FrameInfo = MF->getFrameInfo();
   DEBUG(dbgs() << "Have " << FrameInfo->getNumObjects()
                << " frame object(s)\n");
   for (unsigned i = 0, e = FrameInfo->getNumObjects(); i != e; ++i) {
     DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n");
     if (FrameInfo->getObjectSize(i) > 0) {
-      std::string def = "\t.reg .b";
-      def += utostr(FrameInfo->getObjectSize(i)*8); // Convert to bits
-      def += " s";
+      std::string def = "\t.local .align ";
+      def += utostr(FrameInfo->getObjectAlignment(i));
+      def += " .b8";
+      def += " __local";
       def += utostr(i);
+      def += "[";
+      def += utostr(FrameInfo->getObjectSize(i)); // Convert to bits
+      def += "]";
       def += ";";
       OutStreamer.EmitRawText(Twine(def));
     }
   }
-}
-
-void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  std::string str;
-  str.reserve(64);
-
-  raw_string_ostream OS(str);
-
-  DebugLoc DL = MI->getDebugLoc();
-  if (!DL.isUnknown()) {
-
-    const MDNode *S = DL.getScope(MF->getFunction()->getContext());
-
-    // This is taken from DwarfDebug.cpp, which is conveniently not a public
-    // LLVM class.
-    StringRef Fn;
-    StringRef Dir;
-    unsigned Src = 1;
-    if (S) {
-      DIDescriptor Scope(S);
-      if (Scope.isCompileUnit()) {
-        DICompileUnit CU(S);
-        Fn = CU.getFilename();
-        Dir = CU.getDirectory();
-      } else if (Scope.isFile()) {
-        DIFile F(S);
-        Fn = F.getFilename();
-        Dir = F.getDirectory();
-      } else if (Scope.isSubprogram()) {
-        DISubprogram SP(S);
-        Fn = SP.getFilename();
-        Dir = SP.getDirectory();
-      } else if (Scope.isLexicalBlock()) {
-        DILexicalBlock DB(S);
-        Fn = DB.getFilename();
-        Dir = DB.getDirectory();
-      } else
-        assert(0 && "Unexpected scope info");
-
-      Src = GetOrCreateSourceID(Fn, Dir);
-    }
-    OutStreamer.EmitDwarfLocDirective(Src, DL.getLine(), DL.getCol(),
-                                     0, 0, 0, Fn);
-
-    const MCDwarfLoc& MDL = OutContext.getCurrentDwarfLoc();
-
-    OS << "\t.loc ";
-    OS << utostr(MDL.getFileNum());
-    OS << " ";
-    OS << utostr(MDL.getLine());
-    OS << " ";
-    OS << utostr(MDL.getColumn());
-    OS << "\n";
-  }
-
-
-  // Emit predicate
-  printPredicateOperand(MI, OS);
-
-  // Write instruction to str
-  printInstruction(MI, OS);
-  OS << ';';
-  OS.flush();
-
-  StringRef strref = StringRef(str);
-  OutStreamer.EmitRawText(strref);
-}
-
-void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
-                                 raw_ostream &OS) {
-  const MachineOperand &MO = MI->getOperand(opNum);
-
-  switch (MO.getType()) {
-    default:
-      llvm_unreachable("<unknown operand type>");
-      break;
-    case MachineOperand::MO_GlobalAddress:
-      OS << *Mang->getSymbol(MO.getGlobal());
-      break;
-    case MachineOperand::MO_Immediate:
-      OS << (long) MO.getImm();
-      break;
-    case MachineOperand::MO_MachineBasicBlock:
-      OS << *MO.getMBB()->getSymbol();
-      break;
-    case MachineOperand::MO_Register:
-      OS << getRegisterName(MO.getReg());
-      break;
-    case MachineOperand::MO_FPImmediate:
-      APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt();
-      bool  isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID;
-      // Emit 0F for 32-bit floats and 0D for 64-bit doubles.
-      if (isFloat) {
-        OS << "0F";
-      }
-      else {
-        OS << "0D";
-      }
-      // Emit the encoded floating-point value.
-      if (constFP.getZExtValue() > 0) {
-        OS << constFP.toString(16, false);
-      }
-      else {
-        OS << "00000000";
-        // If We have a double-precision zero, pad to 8-bytes.
-        if (!isFloat) {
-          OS << "00000000";
-        }
-      }
-      break;
-  }
-}
-
-void PTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
-                                    raw_ostream &OS, const char *Modifier) {
-  printOperand(MI, opNum, OS);
 
-  if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
-    return; // don't print "+0"
-
-  OS << "+";
-  printOperand(MI, opNum+1, OS);
+  //unsigned Index = 1;
+  // Print parameter passing params
+  //for (PTXMachineFunctionInfo::param_iterator
+  //     i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) {
+  //  std::string def = "\t.param .b";
+  //  def += utostr(*i);
+  //  def += " __ret_";
+  //  def += utostr(Index);
+  //  Index++;
+  //  def += ";";
+  //  OutStreamer.EmitRawText(Twine(def));
+  //}
 }
 
-void PTXAsmPrinter::printParamOperand(const MachineInstr *MI, int opNum,
-                                      raw_ostream &OS, const char *Modifier) {
-  OS << PARAM_PREFIX << (int) MI->getOperand(opNum).getImm() + 1;
+void PTXAsmPrinter::EmitFunctionBodyEnd() {
+  OutStreamer.EmitRawText(Twine("}"));
 }
 
-void PTXAsmPrinter::printReturnOperand(const MachineInstr *MI, int opNum,
-                                       raw_ostream &OS, const char *Modifier) {
-  OS << RETURN_PREFIX << (int) MI->getOperand(opNum).getImm() + 1;
+void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst TmpInst;
+  LowerPTXMachineInstrToMCInst(MI, TmpInst, *this);
+  OutStreamer.EmitInstruction(TmpInst);
 }
 
 void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
@@ -400,14 +310,14 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
   unsigned alignment = gv->getAlignment();
   if (alignment != 0) {
     decl += ".align ";
-    decl += utostr(Log2_32(gv->getAlignment()));
+    decl += utostr(gv->getAlignment());
     decl += " ";
   }
 
 
   if (PointerType::classof(gv->getType())) {
-    const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType());
-    const Type* elementTy = pointerTy->getElementType();
+    PointerType* pointerTy = dyn_cast<PointerType>(gv->getType());
+    Type* elementTy = pointerTy->getElementType();
 
     decl += ".b8 ";
     decl += gvsym->getName();
@@ -417,14 +327,14 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
     {
       assert(elementTy->isArrayTy() && "Only pointers to arrays are supported");
 
-      const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy);
+      ArrayType* arrayTy = dyn_cast<ArrayType>(elementTy);
       elementTy = arrayTy->getElementType();
 
       unsigned numElements = arrayTy->getNumElements();
 
       while (elementTy->isArrayTy()) {
 
-        arrayTy = dyn_cast<const ArrayType>(elementTy);
+        arrayTy = dyn_cast<ArrayType>(elementTy);
         elementTy = arrayTy->getElementType();
 
         numElements *= arrayTy->getNumElements();
@@ -447,7 +357,7 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
 
     if (gv->hasInitializer())
     {
-      const Constant *C = gv->getInitializer();  
+      const Constant *C = gv->getInitializer();
       if (const ConstantArray *CA = dyn_cast<ConstantArray>(C))
       {
         decl += " = {";
@@ -484,7 +394,7 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
   OutStreamer.AddBlankLine();
 }
 
-void PTXAsmPrinter::EmitFunctionDeclaration() {
+void PTXAsmPrinter::EmitFunctionEntryLabel() {
   // The function label could have already been emitted if two symbols end up
   // conflicting due to asm renaming.  Detect this and emit an error.
   if (!CurrentFnSym->isUndefined()) {
@@ -494,25 +404,39 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
   }
 
   const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  const PTXParamManager &PM = MFI->getParamManager();
   const bool isKernel = MFI->isKernel();
   const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
+  const MachineRegisterInfo& MRI = MF->getRegInfo();
 
   std::string decl = isKernel ? ".entry" : ".func";
 
-  unsigned cnt = 0;
-
   if (!isKernel) {
     decl += " (";
-    for (PTXMachineFunctionInfo::ret_iterator
-         i = MFI->retRegBegin(), e = MFI->retRegEnd(), b = i;
-         i != e; ++i) {
-      if (i != b) {
-        decl += ", ";
+    if (ST.useParamSpaceForDeviceArgs()) {
+      for (PTXParamManager::param_iterator i = PM.ret_begin(), e = PM.ret_end(),
+           b = i; i != e; ++i) {
+        if (i != b) {
+          decl += ", ";
+        }
+
+        decl += ".param .b";
+        decl += utostr(PM.getParamSize(*i));
+        decl += " ";
+        decl += PM.getParamName(*i);
+      }
+    } else {
+      for (PTXMachineFunctionInfo::reg_iterator
+           i = MFI->retreg_begin(), e = MFI->retreg_end(), b = i;
+           i != e; ++i) {
+        if (i != b) {
+          decl += ", ";
+        }
+        decl += ".reg .";
+        decl += getRegisterTypeName(*i, MRI);
+        decl += " ";
+        decl += MFI->getRegisterName(*i);
       }
-      decl += ".reg .";
-      decl += getRegisterTypeName(*i);
-      decl += " ";
-      decl += getRegisterName(*i);
     }
     decl += ")";
   }
@@ -523,26 +447,65 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
 
   decl += " (";
 
-  cnt = 0;
+  const Function *F = MF->getFunction();
 
   // Print parameters
-  for (PTXMachineFunctionInfo::reg_iterator
-       i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
-       i != e; ++i) {
-    if (i != b) {
-      decl += ", ";
-    }
-    if (isKernel || ST.useParamSpaceForDeviceArgs()) {
+  if (isKernel || ST.useParamSpaceForDeviceArgs()) {
+    /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(),
+         b = i; i != e; ++i) {
+      if (i != b) {
+        decl += ", ";
+      }
+
       decl += ".param .b";
-      decl += utostr(*i);
+      decl += utostr(PM.getParamSize(*i));
       decl += " ";
-      decl += PARAM_PREFIX;
-      decl += utostr(++cnt);
-    } else {
+      decl += PM.getParamName(*i);
+    }*/
+    int Counter = 1;
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(),
+         b = i; i != e; ++i) {
+      if (i != b)
+        decl += ", ";
+      const Type *ArgType = (*i).getType();
+      decl += ".param .b";
+      if (ArgType->isPointerTy()) {
+        if (ST.is64Bit())
+          decl += "64";
+        else
+          decl += "32";
+      } else {
+        decl += utostr(ArgType->getPrimitiveSizeInBits());
+      }
+      if (ArgType->isPointerTy() && ST.emitPtrAttribute()) {
+        const PointerType *PtrType = dyn_cast<const PointerType>(ArgType);
+        decl += " .ptr";
+        switch (PtrType->getAddressSpace()) {
+        default:
+          llvm_unreachable("Unknown address space in argument");
+        case PTXStateSpace::Global:
+          decl += " .global";
+          break;
+        case PTXStateSpace::Shared:
+          decl += " .shared";
+          break;
+        }
+      }
+      decl += " __param_";
+      decl += utostr(Counter++);
+    }
+  } else {
+    for (PTXMachineFunctionInfo::reg_iterator
+         i = MFI->argreg_begin(), e = MFI->argreg_end(), b = i;
+         i != e; ++i) {
+      if (i != b) {
+        decl += ", ";
+      }
+
       decl += ".reg .";
-      decl += getRegisterTypeName(*i);
+      decl += getRegisterTypeName(*i, MRI);
       decl += " ";
-      decl += getRegisterName(*i);
+      decl += MFI->getRegisterName(*i);
     }
   }
   decl += ")";
@@ -550,25 +513,6 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
   OutStreamer.EmitRawText(Twine(decl));
 }
 
-void PTXAsmPrinter::
-printPredicateOperand(const MachineInstr *MI, raw_ostream &O) {
-  int i = MI->findFirstPredOperandIdx();
-  if (i == -1)
-    llvm_unreachable("missing predicate operand");
-
-  unsigned reg = MI->getOperand(i).getReg();
-  int predOp = MI->getOperand(i+1).getImm();
-
-  DEBUG(dbgs() << "predicate: (" << reg << ", " << predOp << ")\n");
-
-  if (reg != PTX::NoRegister) {
-    O << '@';
-    if (predOp == PTX::PRED_NEGATE)
-      O << '!';
-    O << getRegisterName(reg);
-  }
-}
-
 unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
                                             StringRef DirName) {
   // If FE did not provide a file name, then assume stdin.
@@ -596,10 +540,58 @@ unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
   return SrcId;
 }
 
-#include "PTXGenAsmWriter.inc"
+MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+  return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) {
+  MCOperand MCOp;
+  const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  const MCExpr *Expr;
+  const char *RegSymbolName;
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("Unknown operand type");
+  case MachineOperand::MO_Register:
+    // We create register operands as symbols, since the PTXInstPrinter class
+    // has no way to map virtual registers back to a name without some ugly
+    // hacks.
+    // FIXME: Figure out a better way to handle virtual register naming.
+    RegSymbolName = MFI->getRegisterName(MO.getReg());
+    Expr = MCSymbolRefExpr::Create(RegSymbolName, MCSymbolRefExpr::VK_None,
+                                   OutContext);
+    MCOp = MCOperand::CreateExpr(Expr);
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                 MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_FPImmediate:
+    APFloat Val = MO.getFPImm()->getValueAPF();
+    bool ignored;
+    Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+    MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
+    break;
+  }
+
+  return MCOp;
+}
 
 // Force static initialization.
 extern "C" void LLVMInitializePTXAsmPrinter() {
   RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target);
   RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target);
 }
+
diff --git a/lib/Target/PTX/PTXAsmPrinter.h b/lib/Target/PTX/PTXAsmPrinter.h
new file mode 100644
index 0000000000000..538c0802a27e4
--- /dev/null
+++ b/lib/Target/PTX/PTXAsmPrinter.h
@@ -0,0 +1,57 @@
+//===-- PTXAsmPrinter.h - Print machine code to a PTX file ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PTX Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXASMPRINTER_H
+#define PTXASMPRINTER_H
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter {
+public:
+  explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {}
+
+  const char *getPassName() const { return "PTX Assembly Printer"; }
+
+  bool doFinalization(Module &M);
+
+  virtual void EmitStartOfAsmFile(Module &M);
+  virtual void EmitFunctionBodyStart();
+  virtual void EmitFunctionBodyEnd();
+  virtual void EmitFunctionEntryLabel();
+  virtual void EmitInstruction(const MachineInstr *MI);
+
+  unsigned GetOrCreateSourceID(StringRef FileName,
+                               StringRef DirName);
+
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCOperand lowerOperand(const MachineOperand &MO);
+
+private:
+  void EmitVariableDeclaration(const GlobalVariable *gv);
+  void EmitFunctionDeclaration();
+
+  StringMap<unsigned> SourceIdMap;
+}; // class PTXAsmPrinter
+} // namespace llvm
+
+#endif
+
diff --git a/lib/Target/PTX/PTXCallingConv.td b/lib/Target/PTX/PTXCallingConv.td
deleted file mode 100644
index 3e3ff48966212..0000000000000
--- a/lib/Target/PTX/PTXCallingConv.td
+++ /dev/null
@@ -1,29 +0,0 @@
-
-//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the PTX architecture.
-//
-//===----------------------------------------------------------------------===//
-
-// PTX Formal Parameter Calling Convention
-def CC_PTX : CallingConv<[
-  CCIfType<[i1],      CCAssignToReg<[P12, P13, P14, P15, P16, P17, P18, P19, P20, P21, P22, P23, P24, P25, P26, P27, P28, P29, P30, P31, P32, P33, P34, P35, P36, P37, P38, P39, P40, P41, P42, P43, P44, P45, P46, P47, P48, P49, P50, P51, P52, P53, P54, P55, P56, P57, P58, P59, P60, P61, P62, P63, P64, P65, P66, P67, P68, P69, P70, P71, P72, P73, P74, P75, P76, P77, P78, P79, P80, P81, P82, P83, P84, P85, P86, P87, P88, P89, P90, P91, P92, P93, P94, P95, P96, P97, P98, P99, P100, P101, P102, P103, P104, P105, P106, P107, P108, P109, P110, P111, P112, P113, P114, P115, P116, P117, P118, P119, P120, P121, P122, P123, P124, P125, P126, P127]>>,
-  CCIfType<[i16],     CCAssignToReg<[RH12, RH13, RH14, RH15, RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31, RH32, RH33, RH34, RH35, RH36, RH37, RH38, RH39, RH40, RH41, RH42, RH43, RH44, RH45, RH46, RH47, RH48, RH49, RH50, RH51, RH52, RH53, RH54, RH55, RH56, RH57, RH58, RH59, RH60, RH61, RH62, RH63, RH64, RH65, RH66, RH67, RH68, RH69, RH70, RH71, RH72, RH73, RH74, RH75, RH76, RH77, RH78, RH79, RH80, RH81, RH82, RH83, RH84, RH85, RH86, RH87, RH88, RH89, RH90, RH91, RH92, RH93, RH94, RH95, RH96, RH97, RH98, RH99, RH100, RH101, RH102, RH103, RH104, RH105, RH106, RH107, RH108, RH109, RH110, RH111, RH112, RH113, RH114, RH115, RH116, RH117, RH118, RH119, RH120, RH121, RH122, RH123, RH124, RH125, RH126, RH127]>>,
-  CCIfType<[i32,f32], CCAssignToReg<[R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127]>>,
-  CCIfType<[i64,f64], CCAssignToReg<[RD12, RD13, RD14, RD15, RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31, RD32, RD33, RD34, RD35, RD36, RD37, RD38, RD39, RD40, RD41, RD42, RD43, RD44, RD45, RD46, RD47, RD48, RD49, RD50, RD51, RD52, RD53, RD54, RD55, RD56, RD57, RD58, RD59, RD60, RD61, RD62, RD63, RD64, RD65, RD66, RD67, RD68, RD69, RD70, RD71, RD72, RD73, RD74, RD75, RD76, RD77, RD78, RD79, RD80, RD81, RD82, RD83, RD84, RD85, RD86, RD87, RD88, RD89, RD90, RD91, RD92, RD93, RD94, RD95, RD96, RD97, RD98, RD99, RD100, RD101, RD102, RD103, RD104, RD105, RD106, RD107, RD108, RD109, RD110, RD111, RD112, RD113, RD114, RD115, RD116, RD117, RD118, RD119, RD120, RD121, RD122, RD123, RD124, RD125, RD126, RD127]>>
-]>;
-
-// PTX Return Value Calling Convention
-def RetCC_PTX : CallingConv<[
-  CCIfType<[i1],      CCAssignToReg<[P0, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11]>>,
-  CCIfType<[i16],     CCAssignToReg<[RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, RH8, RH9, RH10, RH11]>>,
-  CCIfType<[i32,f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11]>>,
-  CCIfType<[i64,f64], CCAssignToReg<[RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8, RD9, RD10, RD11]>>
-]>;
diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp
new file mode 100644
index 0000000000000..0b653e04b3bb1
--- /dev/null
+++ b/lib/Target/PTX/PTXFPRoundingModePass.cpp
@@ -0,0 +1,179 @@
+//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a machine function pass that sets appropriate FP rounding
+// modes for all relevant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-fp-rounding-mode"
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+// NOTE: PTXFPRoundingModePass should be executed just before emission.
+
+namespace llvm {
+  /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to
+  /// all FP instructions. Essentially, this pass just looks for all FP
+  /// instructions that have a rounding mode set to RndDefault, and sets an
+  /// appropriate rounding mode based on the target device.
+  ///
+  class PTXFPRoundingModePass : public MachineFunctionPass {
+    private:
+      static char ID;
+
+      typedef std::pair<unsigned, unsigned> RndModeDesc;
+
+      PTXTargetMachine& TargetMachine;
+      DenseMap<unsigned, RndModeDesc> Instrs;
+
+    public:
+      PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
+        : MachineFunctionPass(ID),
+          TargetMachine(TM) {
+        initializeMap();
+      }
+
+      virtual bool runOnMachineFunction(MachineFunction &MF);
+
+      virtual const char *getPassName() const {
+        return "PTX FP Rounding Mode Pass";
+      }
+
+    private:
+
+      void initializeMap();
+      void processInstruction(MachineInstr &MI);
+  }; // class PTXFPRoundingModePass
+} // namespace llvm
+
+using namespace llvm;
+
+char PTXFPRoundingModePass::ID = 0;
+
+bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) {
+  // Look at each basic block
+  for (MachineFunction::iterator bbi = MF.begin(), bbe = MF.end(); bbi != bbe;
+       ++bbi) {
+    MachineBasicBlock &MBB = *bbi;
+    // Look at each instruction
+    for (MachineBasicBlock::iterator ii = MBB.begin(), ie = MBB.end();
+         ii != ie; ++ii) {
+      MachineInstr &MI = *ii;
+      processInstruction(MI);
+    }
+  }
+  return false;
+}
+
+void PTXFPRoundingModePass::initializeMap() {
+  using namespace PTXRoundingMode;
+  const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>();
+
+  // Build a map of default rounding mode for all instructions that need a
+  // rounding mode.
+  Instrs[PTX::FADDrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FADDri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FADDrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FADDri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSUBrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSUBri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSUBrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSUBri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FMULrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FMULri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FMULrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FMULri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+
+  Instrs[PTX::FNEGrr32] = std::make_pair(1U, (unsigned)RndNone);
+  Instrs[PTX::FNEGri32] = std::make_pair(1U, (unsigned)RndNone);
+  Instrs[PTX::FNEGrr64] = std::make_pair(1U, (unsigned)RndNone);
+  Instrs[PTX::FNEGri64] = std::make_pair(1U, (unsigned)RndNone);
+
+  unsigned FDivRndMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone;
+  Instrs[PTX::FDIVrr32] = std::make_pair(1U, FDivRndMode);
+  Instrs[PTX::FDIVri32] = std::make_pair(1U, FDivRndMode);
+  Instrs[PTX::FDIVrr64] = std::make_pair(1U, FDivRndMode);
+  Instrs[PTX::FDIVri64] = std::make_pair(1U, FDivRndMode);
+
+  unsigned FMADRndMode = ST.fmadNeedsRoundingMode() ? RndNearestEven : RndNone;
+  Instrs[PTX::FMADrrr32] = std::make_pair(1U, FMADRndMode);
+  Instrs[PTX::FMADrri32] = std::make_pair(1U, FMADRndMode);
+  Instrs[PTX::FMADrii32] = std::make_pair(1U, FMADRndMode);
+  Instrs[PTX::FMADrrr64] = std::make_pair(1U, FMADRndMode);
+  Instrs[PTX::FMADrri64] = std::make_pair(1U, FMADRndMode);
+  Instrs[PTX::FMADrii64] = std::make_pair(1U, FMADRndMode);
+
+  Instrs[PTX::FSQRTrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSQRTri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSQRTrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FSQRTri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+
+  Instrs[PTX::FSINrr32] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FSINri32] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FSINrr64] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FSINri64] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FCOSrr32] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FCOSri32] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FCOSrr64] = std::make_pair(1U, (unsigned)RndApprox);
+  Instrs[PTX::FCOSri64] = std::make_pair(1U, (unsigned)RndApprox);
+
+  Instrs[PTX::CVTu16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTs64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+
+  Instrs[PTX::CVTf32u16] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32s16] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32u32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32s32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32u64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32s64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf32f64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64u16] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64s16] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64u32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64s32] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64u64] = std::make_pair(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::CVTf64s64] = std::make_pair(1U, (unsigned)RndNearestEven);
+}
+
+void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) {
+  // Is this an instruction that needs a rounding mode?
+  if (Instrs.count(MI.getOpcode())) {
+    const RndModeDesc &Desc = Instrs[MI.getOpcode()];
+    // Get the rounding mode operand
+    MachineOperand &Op = MI.getOperand(Desc.first);
+    // Update the rounding mode if needed
+    if (Op.getImm() == PTXRoundingMode::RndDefault) {
+      Op.setImm(Desc.second);
+    }
+  }
+}
+
+FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM,
+                                                CodeGenOpt::Level OptLevel) {
+  return new PTXFPRoundingModePass(TM, OptLevel);
+}
+
diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp
index 9adfa624b29ed..5c7ee298f31ab 100644
--- a/lib/Target/PTX/PTXISelDAGToDAG.cpp
+++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp
@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PTX.h"
+#include "PTXMachineFunctionInfo.h"
 #include "PTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Support/Debug.h"
@@ -37,6 +39,7 @@ class PTXDAGToDAGISel : public SelectionDAGISel {
     bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2);
     bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset);
     bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset);
+    bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset);
 
     // Include the pieces auto'gened from the target description
 #include "PTXGenDAGISel.inc"
@@ -46,6 +49,10 @@ class PTXDAGToDAGISel : public SelectionDAGISel {
     // pattern (PTXbrcond bb:$d, ...) in PTXInstrInfo.td
     SDNode *SelectBRCOND(SDNode *Node);
 
+    SDNode *SelectREADPARAM(SDNode *Node);
+    SDNode *SelectWRITEPARAM(SDNode *Node);
+    SDNode *SelectFrameIndex(SDNode *Node);
+
     bool isImm(const SDValue &operand);
     bool SelectImm(const SDValue &operand, SDValue &imm);
 
@@ -68,6 +75,12 @@ SDNode *PTXDAGToDAGISel::Select(SDNode *Node) {
   switch (Node->getOpcode()) {
     case ISD::BRCOND:
       return SelectBRCOND(Node);
+    case PTXISD::READ_PARAM:
+      return SelectREADPARAM(Node);
+    case PTXISD::WRITE_PARAM:
+      return SelectWRITEPARAM(Node);
+    case ISD::FrameIndex:
+      return SelectFrameIndex(Node);
     default:
       return SelectCode(Node);
   }
@@ -79,7 +92,7 @@ SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
   SDValue Chain  = Node->getOperand(0);
   SDValue Pred   = Node->getOperand(1);
   SDValue Target = Node->getOperand(2); // branch target
-  SDValue PredOp = CurDAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+  SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::Normal, MVT::i32);
   DebugLoc dl = Node->getDebugLoc();
 
   assert(Target.getOpcode()  == ISD::BasicBlock);
@@ -90,6 +103,97 @@ SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
   return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4);
 }
 
+SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue Index = Node->getOperand(1);
+
+  int OpCode;
+
+  // Get the type of parameter we are reading
+  EVT VT = Node->getValueType(0);
+  assert(VT.isSimple() && "READ_PARAM only implemented for MVT types");
+
+  MVT Type = VT.getSimpleVT();
+
+  if (Type == MVT::i1)
+    OpCode = PTX::READPARAMPRED;
+  else if (Type == MVT::i16)
+    OpCode = PTX::READPARAMI16;
+  else if (Type == MVT::i32)
+    OpCode = PTX::READPARAMI32;
+  else if (Type == MVT::i64)
+    OpCode = PTX::READPARAMI64;
+  else if (Type == MVT::f32)
+    OpCode = PTX::READPARAMF32;
+  else {
+    assert(Type == MVT::f64 && "Unexpected type!");
+    OpCode = PTX::READPARAMF64;
+  }
+
+  SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+  SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+  DebugLoc dl = Node->getDebugLoc();
+
+  SDValue Ops[] = { Index, Pred, PredOp, Chain };
+  return CurDAG->getMachineNode(OpCode, dl, VT, Ops, 4);
+}
+
+SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) {
+
+  SDValue Chain = Node->getOperand(0);
+  SDValue Value = Node->getOperand(1);
+
+  int OpCode;
+
+  //Node->dumpr(CurDAG);
+
+  // Get the type of parameter we are writing
+  EVT VT = Value->getValueType(0);
+  assert(VT.isSimple() && "WRITE_PARAM only implemented for MVT types");
+
+  MVT Type = VT.getSimpleVT();
+
+  if (Type == MVT::i1)
+    OpCode = PTX::WRITEPARAMPRED;
+  else if (Type == MVT::i16)
+    OpCode = PTX::WRITEPARAMI16;
+  else if (Type == MVT::i32)
+    OpCode = PTX::WRITEPARAMI32;
+  else if (Type == MVT::i64)
+    OpCode = PTX::WRITEPARAMI64;
+  else if (Type == MVT::f32)
+    OpCode = PTX::WRITEPARAMF32;
+  else if (Type == MVT::f64)
+    OpCode = PTX::WRITEPARAMF64;
+  else
+    llvm_unreachable("Invalid type in SelectWRITEPARAM");
+
+  SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+  SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+  DebugLoc dl = Node->getDebugLoc();
+
+  SDValue Ops[] = { Value, Pred, PredOp, Chain };
+  SDNode* Ret = CurDAG->getMachineNode(OpCode, dl, MVT::Other, Ops, 4);
+
+  //dbgs() << "SelectWRITEPARAM produced:\n\t";
+  //Ret->dumpr(CurDAG);
+
+  return Ret;
+}
+
+SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) {
+  int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+  //dbgs() << "Selecting FrameIndex at index " << FI << "\n";
+  //SDValue TFI = CurDAG->getTargetFrameIndex(FI, Node->getValueType(0));
+
+  PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+
+  SDValue FrameSymbol = CurDAG->getTargetExternalSymbol(MFI->getFrameSymbol(FI),
+                                                        Node->getValueType(0));
+
+  return FrameSymbol.getNode();
+}
+
 // Match memory operand of the form [reg+reg]
 bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
   if (Addr.getOpcode() != ISD::ADD || Addr.getNumOperands() < 2 ||
@@ -107,14 +211,54 @@ bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
 // Match memory operand of the form [reg], [imm+reg], and [reg+imm]
 bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
                                    SDValue &Offset) {
-  if (Addr.getOpcode() != ISD::ADD) {
+  // FrameIndex addresses are handled separately
+  //errs() << "SelectADDRri: ";
+  //Addr.getNode()->dumpr();
+  if (isa<FrameIndexSDNode>(Addr)) {
+    //errs() << "Failure\n";
+    return false;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    Base = Addr.getOperand(0);
+    if (isa<FrameIndexSDNode>(Base)) {
+      //errs() << "Failure\n";
+      return false;
+    }
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    //errs() << "Success\n";
+    return true;
+  }
+
+  /*if (Addr.getNumOperands() == 1) {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+    errs() << "Success\n";
+    return true;
+  }*/
+
+  //errs() << "SelectADDRri fails on: ";
+  //Addr.getNode()->dumpr();
+
+  if (isImm(Addr)) {
+    //errs() << "Failure\n";
+    return false;
+  }
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+
+  //errs() << "Success\n";
+  return true;
+
+  /*if (Addr.getOpcode() != ISD::ADD) {
     // let SelectADDRii handle the [imm] case
     if (isImm(Addr))
       return false;
     // it is [reg]
 
     assert(Addr.getValueType().isSimple() && "Type must be simple");
-
     Base = Addr;
     Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
 
@@ -136,7 +280,7 @@ bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
     }
 
   // neither [reg+imm] nor [imm+reg]
-  return false;
+  return false;*/
 }
 
 // Match memory operand of the form [imm+imm] and [imm]
@@ -160,6 +304,36 @@ bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base,
   return false;
 }
 
+// Match memory operand of the form [reg], [imm+reg], and [reg+imm]
+bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base,
+                                      SDValue &Offset) {
+  //errs() << "SelectADDRlocal: ";
+  //Addr.getNode()->dumpr();
+  if (isa<FrameIndexSDNode>(Addr)) {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+    //errs() << "Success\n";
+    return true;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    Base = Addr.getOperand(0);
+    if (!isa<FrameIndexSDNode>(Base)) {
+      //errs() << "Failure\n";
+      return false;
+    }
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    //errs() << "Offset: ";
+    //Offset.getNode()->dumpr();
+    //errs() << "Success\n";
+    return true;
+  }
+
+  //errs() << "Failure\n";
+  return false;
+}
+
 bool PTXDAGToDAGISel::isImm(const SDValue &operand) {
   return ConstantSDNode::classof(operand.getNode());
 }
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index 6fcf710e3f1fe..3307d91a61880 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -16,22 +16,18 @@
 #include "PTXMachineFunctionInfo.h"
 #include "PTXRegisterInfo.h"
 #include "PTXSubtarget.h"
+#include "llvm/Function.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-//===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-#include "PTXGenCallingConv.inc"
-
 //===----------------------------------------------------------------------===//
 // TargetLowering Implementation
 //===----------------------------------------------------------------------===//
@@ -47,57 +43,58 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
   addRegisterClass(MVT::f64, PTX::RegF64RegisterClass);
 
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
   setMinFunctionAlignment(2);
-  
+
   ////////////////////////////////////
   /////////// Expansion //////////////
   ////////////////////////////////////
-  
+
   // (any/zero/sign) extload => load + (any/zero/sign) extend
-  
+
   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
-  
+
   // f32 extload => load + fextend
-  
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);  
-  
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
   // f64 truncstore => trunc + store
-  
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand); 
-  
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // sign_extend_inreg => sign_extend
-  
+
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-  
+
   // br_cc => brcond
-  
+
   setOperationAction(ISD::BR_CC, MVT::Other, Expand);
 
   // select_cc => setcc
-  
+
   setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
-  
+
   ////////////////////////////////////
   //////////// Legal /////////////////
   ////////////////////////////////////
-  
+
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-  
+
   ////////////////////////////////////
   //////////// Custom ////////////////
   ////////////////////////////////////
-  
+
   // customise setcc to use bitwise logic if possible
-  
+
   setOperationAction(ISD::SETCC, MVT::i1, Custom);
 
   // customize translation of memory addresses
-  
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 
@@ -105,7 +102,7 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
   computeRegisterProperties();
 }
 
-MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PTXTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i1;
 }
 
@@ -130,10 +127,16 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
       return "PTXISD::LOAD_PARAM";
     case PTXISD::STORE_PARAM:
       return "PTXISD::STORE_PARAM";
+    case PTXISD::READ_PARAM:
+      return "PTXISD::READ_PARAM";
+    case PTXISD::WRITE_PARAM:
+      return "PTXISD::WRITE_PARAM";
     case PTXISD::EXIT:
       return "PTXISD::EXIT";
     case PTXISD::RET:
       return "PTXISD::RET";
+    case PTXISD::CALL:
+      return "PTXISD::CALL";
   }
 }
 
@@ -149,7 +152,7 @@ SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
-  // Look for X == 0, X == 1, X != 0, or X != 1  
+  // Look for X == 0, X == 1, X != 0, or X != 1
   // We can simplify these to bitwise logic
 
   if (Op1.getOpcode() == ISD::Constant &&
@@ -197,6 +200,7 @@ SDValue PTXTargetLowering::
   MachineFunction &MF = DAG.getMachineFunction();
   const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &PM = MFI->getParamManager();
 
   switch (CallConv) {
     default:
@@ -216,68 +220,34 @@ SDValue PTXTargetLowering::
   if (MFI->isKernel() || ST.useParamSpaceForDeviceArgs()) {
     // We just need to emit the proper LOAD_PARAM ISDs
     for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-
       assert((!MFI->isKernel() || Ins[i].VT != MVT::i1) &&
              "Kernels cannot take pred operands");
 
+      unsigned ParamSize = Ins[i].VT.getStoreSizeInBits();
+      unsigned Param = PM.addArgumentParam(ParamSize);
+      const std::string &ParamName = PM.getParamName(Param);
+      SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                       MVT::Other);
       SDValue ArgValue = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
-                                     DAG.getTargetConstant(i, MVT::i32));
+                                     ParamValue);
       InVals.push_back(ArgValue);
-
-      // Instead of storing a physical register in our argument list, we just
-      // store the total size of the parameter, in bits.  The ASM printer
-      // knows how to process this.
-      MFI->addArgReg(Ins[i].VT.getStoreSizeInBits());
     }
   }
   else {
-    // For device functions, we use the PTX calling convention to do register
-    // assignments then create CopyFromReg ISDs for the allocated registers
-
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs,
-                   *DAG.getContext());
-
-    CCInfo.AnalyzeFormalArguments(Ins, CC_PTX);
-
-    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-
-      CCValAssign&         VA    = ArgLocs[i];
-      EVT                  RegVT = VA.getLocVT();
-      TargetRegisterClass* TRC   = 0;
-
-      assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
-
-      // Determine which register class we need
-      if (RegVT == MVT::i1) {
-        TRC = PTX::RegPredRegisterClass;
-      }
-      else if (RegVT == MVT::i16) {
-        TRC = PTX::RegI16RegisterClass;
-      }
-      else if (RegVT == MVT::i32) {
-        TRC = PTX::RegI32RegisterClass;
-      }
-      else if (RegVT == MVT::i64) {
-        TRC = PTX::RegI64RegisterClass;
-      }
-      else if (RegVT == MVT::f32) {
-        TRC = PTX::RegF32RegisterClass;
-      }
-      else if (RegVT == MVT::f64) {
-        TRC = PTX::RegF64RegisterClass;
-      }
-      else {
-        llvm_unreachable("Unknown parameter type");
-      }
+    for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+      EVT                  RegVT = Ins[i].VT;
+      TargetRegisterClass* TRC   = getRegClassFor(RegVT);
 
+      // Use a unique index in the instruction to prevent instruction folding.
+      // Yes, this is a hack.
+      SDValue Index = DAG.getTargetConstant(i, MVT::i32);
       unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
-      MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg);
+      SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, RegVT, Chain,
+                                     Index);
 
-      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
       InVals.push_back(ArgValue);
 
-      MFI->addArgReg(VA.getLocReg());
+      MFI->addArgReg(Reg);
     }
   }
 
@@ -301,41 +271,66 @@ SDValue PTXTargetLowering::
       assert(Outs.size() == 0 && "Kernel must return void.");
       return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain);
     case CallingConv::PTX_Device:
-      //assert(Outs.size() <= 1 && "Can at most return one value.");
+      assert(Outs.size() <= 1 && "Can at most return one value.");
       break;
   }
 
   MachineFunction& MF = DAG.getMachineFunction();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &PM = MFI->getParamManager();
 
   SDValue Flag;
+  const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
 
-  // Even though we could use the .param space for return arguments for
-  // device functions if SM >= 2.0 and the number of return arguments is
-  // only 1, we just always use registers since this makes the codegen
-  // easier.
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-  getTargetMachine(), RVLocs, *DAG.getContext());
-
-  CCInfo.AnalyzeReturn(Outs, RetCC_PTX);
-
-  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
-    CCValAssign& VA  = RVLocs[i];
-
-    assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
+  if (ST.useParamSpaceForDeviceArgs()) {
+    assert(Outs.size() < 2 && "Device functions can return at most one value");
+
+    if (Outs.size() == 1) {
+      unsigned ParamSize = OutVals[0].getValueType().getSizeInBits();
+      unsigned Param = PM.addReturnParam(ParamSize);
+      const std::string &ParamName = PM.getParamName(Param);
+      SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                       MVT::Other);
+      Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                          ParamValue, OutVals[0]);
+    }
+  } else {
+    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+      EVT                  RegVT = Outs[i].VT;
+      TargetRegisterClass* TRC = 0;
 
-    unsigned Reg = VA.getLocReg();
+      // Determine which register class we need
+      if (RegVT == MVT::i1) {
+        TRC = PTX::RegPredRegisterClass;
+      }
+      else if (RegVT == MVT::i16) {
+        TRC = PTX::RegI16RegisterClass;
+      }
+      else if (RegVT == MVT::i32) {
+        TRC = PTX::RegI32RegisterClass;
+      }
+      else if (RegVT == MVT::i64) {
+        TRC = PTX::RegI64RegisterClass;
+      }
+      else if (RegVT == MVT::f32) {
+        TRC = PTX::RegF32RegisterClass;
+      }
+      else if (RegVT == MVT::f64) {
+        TRC = PTX::RegF64RegisterClass;
+      }
+      else {
+        llvm_unreachable("Unknown parameter type");
+      }
 
-    DAG.getMachineFunction().getRegInfo().addLiveOut(Reg);
+      unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
 
-    Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag);
+      SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]/*, Flag*/);
+      SDValue OutReg = DAG.getRegister(Reg, RegVT);
 
-    // Guarantee that all emitted copies are stuck together,
-    // avoiding something bad
-    Flag = Chain.getValue(1);
+      Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg);
 
-    MFI->addRetReg(Reg);
+      MFI->addRetReg(Reg);
+    }
   }
 
   if (Flag.getNode() == 0) {
@@ -345,3 +340,83 @@ SDValue PTXTargetLowering::
     return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag);
   }
 }
+
+SDValue
+PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) const {
+
+  MachineFunction& MF = DAG.getMachineFunction();
+  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &PM = MFI->getParamManager();
+
+  assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() &&
+         "Calls are not handled for the target device");
+
+  std::vector<SDValue> Ops;
+  // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs]
+  Ops.resize(Outs.size() + Ins.size() + 4);
+
+  Ops[0] = Chain;
+
+  // Identify the callee function
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+  assert(cast<Function>(GV)->getCallingConv() == CallingConv::PTX_Device &&
+         "PTX function calls must be to PTX device functions");
+  Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+  Ops[Ins.size()+2] = Callee;
+
+  // Generate STORE_PARAM nodes for each function argument.  In PTX, function
+  // arguments are explicitly stored into .param variables and passed as
+  // arguments. There is no register/stack-based calling convention in PTX.
+  Ops[Ins.size()+3] = DAG.getTargetConstant(OutVals.size(), MVT::i32);
+  for (unsigned i = 0; i != OutVals.size(); ++i) {
+    unsigned Size = OutVals[i].getValueType().getSizeInBits();
+    unsigned Param = PM.addLocalParam(Size);
+    const std::string &ParamName = PM.getParamName(Param);
+    SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                     MVT::Other);
+    Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                        ParamValue, OutVals[i]);
+    Ops[i+Ins.size()+4] = ParamValue;
+  }
+
+  std::vector<SDValue> InParams;
+
+  // Generate list of .param variables to hold the return value(s).
+  Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32);
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    unsigned Size = Ins[i].VT.getStoreSizeInBits();
+    unsigned Param = PM.addLocalParam(Size);
+    const std::string &ParamName = PM.getParamName(Param);
+    SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                     MVT::Other);
+    Ops[i+2] = ParamValue;
+    InParams.push_back(ParamValue);
+  }
+
+  Ops[0] = Chain;
+
+  // Create the CALL node.
+  Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size());
+
+  // Create the LOAD_PARAM nodes that retrieve the function return value(s).
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
+                               InParams[i]);
+    InVals.push_back(Load);
+  }
+
+  return Chain;
+}
+
+unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) {
+  // All arguments consist of one "register," regardless of the type.
+  return 1;
+}
+
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index 43185416e1fc7..4d2566540af20 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -26,9 +26,12 @@ namespace PTXISD {
     FIRST_NUMBER = ISD::BUILTIN_OP_END,
     LOAD_PARAM,
     STORE_PARAM,
+    READ_PARAM,
+    WRITE_PARAM,
     EXIT,
     RET,
-    COPY_ADDRESS
+    COPY_ADDRESS,
+    CALL
   };
 }                               // namespace PTXISD
 
@@ -60,7 +63,19 @@ class PTXTargetLowering : public TargetLowering {
                   DebugLoc dl,
                   SelectionDAG &DAG) const;
 
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg,
+                bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<SDValue> &OutVals,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual EVT getSetCCResultType(EVT VT) const;
+
+    virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT);
 
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td
index 8cee351ee0df5..397fdc319a84f 100644
--- a/lib/Target/PTX/PTXInstrFormats.td
+++ b/lib/Target/PTX/PTXInstrFormats.td
@@ -7,12 +7,39 @@
 //
 //===----------------------------------------------------------------------===//
 
-// PTX Predicate operand, default to (0, 0) = (zero-reg, always).
+
+// Rounding Mode Specifier
+/*class RoundingMode<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def RndDefault     : RoundingMode<0>;
+def RndNearestEven : RoundingMode<1>;
+def RndNearestZero : RoundingMode<2>;
+def RndNegInf      : RoundingMode<3>;
+def RndPosInf      : RoundingMode<4>;
+def RndApprox      : RoundingMode<5>;*/
+
+
+// Rounding Mode Operand
+def RndMode : Operand<i32> {
+  let PrintMethod = "printRoundingMode";
+}
+
+def RndDefault : PatLeaf<(i32 0)>;
+
+// PTX Predicate operand, default to (0, 0) = (zero-reg, none).
 // Leave PrintMethod empty; predicate printing is defined elsewhere.
 def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm),
-                                     (ops (i1 zero_reg), (i32 0))>;
+                                     (ops (i1 zero_reg), (i32 2))>;
 
+def RndModeOperand : Operand<OtherVT> {
+  let MIOperandInfo = (ops i32imm);
+}
+
+// Instruction Types
 let Namespace = "PTX" in {
+
   class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern>
     : Instruction {
       dag OutOperandList = oops;
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index 425265a2fdb75..1b947a5400f46 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -16,10 +16,11 @@
 #include "PTX.h"
 #include "PTXInstrInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define GET_INSTRINFO_CTOR
@@ -47,8 +48,13 @@ void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, DebugLoc DL,
                                unsigned DstReg, unsigned SrcReg,
                                bool KillSrc) const {
-  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
-    if (map[i].cls->contains(DstReg, SrcReg)) {
+
+  const MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo();
+  //assert(MRI.getRegClass(SrcReg) == MRI.getRegClass(DstReg) &&
+  //  "Invalid register copy between two register classes");
+
+  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) {
+    if (map[i].cls == MRI.getRegClass(DstReg)) {
       const MCInstrDesc &MCID = get(map[i].opcode);
       MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).
         addReg(SrcReg, getKillRegState(KillSrc));
@@ -161,7 +167,7 @@ DefinesPredicate(MachineInstr *MI,
     return false;
 
   Pred.push_back(MO);
-  Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+  Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None));
   return true;
 }
 
@@ -277,7 +283,7 @@ InsertBranch(MachineBasicBlock &MBB,
     BuildMI(&MBB, DL, get(PTX::BRAdp))
       .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm());
     BuildMI(&MBB, DL, get(PTX::BRAd))
-      .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL);
+      .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
     return 2;
   } else if (Cond.size()) {
     BuildMI(&MBB, DL, get(PTX::BRAdp))
@@ -285,7 +291,7 @@ InsertBranch(MachineBasicBlock &MBB,
     return 1;
   } else {
     BuildMI(&MBB, DL, get(PTX::BRAd))
-      .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL);
+      .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
     return 1;
   }
 }
@@ -296,34 +302,7 @@ void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      unsigned SrcReg, bool isKill, int FrameIdx,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
-  MachineInstr& MI = *MII;
-  DebugLoc DL = MI.getDebugLoc();
-
-  DEBUG(dbgs() << "storeRegToStackSlot: " << MI);
-
-  int OpCode;
-
-  // Select the appropriate opcode based on the register class
-  if (RC == PTX::RegI16RegisterClass) {
-    OpCode = PTX::STACKSTOREI16;
-  }  else if (RC == PTX::RegI32RegisterClass) {
-    OpCode = PTX::STACKSTOREI32;
-  }  else if (RC == PTX::RegI64RegisterClass) {
-    OpCode = PTX::STACKSTOREI32;
-  }  else if (RC == PTX::RegF32RegisterClass) {
-    OpCode = PTX::STACKSTOREF32;
-  }  else if (RC == PTX::RegF64RegisterClass) {
-    OpCode = PTX::STACKSTOREF64;
-  } else {
-    llvm_unreachable("Unknown PTX register class!");
-  }
-
-  // Build the store instruction (really a mov)
-  MachineInstrBuilder MIB = BuildMI(MBB, MII, DL, get(OpCode));
-  MIB.addFrameIndex(FrameIdx);
-  MIB.addReg(SrcReg);
-
-  AddDefaultPredicate(MIB);
+  assert(false && "storeRegToStackSlot should not be called for PTX");
 }
 
 void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -331,34 +310,7 @@ void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         unsigned DestReg, int FrameIdx,
                                         const TargetRegisterClass *RC,
                                         const TargetRegisterInfo *TRI) const {
-  MachineInstr& MI = *MII;
-  DebugLoc DL = MI.getDebugLoc();
-
-  DEBUG(dbgs() << "loadRegToStackSlot: " << MI);
-
-  int OpCode;
-
-  // Select the appropriate opcode based on the register class
-  if (RC == PTX::RegI16RegisterClass) {
-    OpCode = PTX::STACKLOADI16;
-  } else if (RC == PTX::RegI32RegisterClass) {
-    OpCode = PTX::STACKLOADI32;
-  } else if (RC == PTX::RegI64RegisterClass) {
-    OpCode = PTX::STACKLOADI32;
-  } else if (RC == PTX::RegF32RegisterClass) {
-    OpCode = PTX::STACKLOADF32;
-  } else if (RC == PTX::RegF64RegisterClass) {
-    OpCode = PTX::STACKLOADF64;
-  } else {
-    llvm_unreachable("Unknown PTX register class!");
-  }
-
-  // Build the load instruction (really a mov)
-  MachineInstrBuilder MIB = BuildMI(MBB, MII, DL, get(OpCode));
-  MIB.addReg(DestReg);
-  MIB.addFrameIndex(FrameIdx);
-
-  AddDefaultPredicate(MIB);
+  assert(false && "loadRegFromStackSlot should not be called for PTX");
 }
 
 // static helper routines
@@ -367,7 +319,7 @@ MachineSDNode *PTXInstrInfo::
 GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
                   DebugLoc dl, EVT VT, SDValue Op1) {
   SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
-  SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+  SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
   SDValue ops[] = { Op1, predReg, predOp };
   return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
 }
@@ -376,7 +328,7 @@ MachineSDNode *PTXInstrInfo::
 GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
                   DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) {
   SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
-  SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+  SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
   SDValue ops[] = { Op1, Op2, predReg, predOp };
   return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
 }
@@ -384,7 +336,7 @@ GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
 void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) {
   if (MI->findFirstPredOperandIdx() == -1) {
     MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
-    MI->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+    MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None));
   }
 }
 
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index 6bfe906d40abe..a3fcea9038cfb 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -21,10 +21,6 @@ include "PTXInstrFormats.td"
 // Code Generation Predicates
 //===----------------------------------------------------------------------===//
 
-// Addressing
-def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
-def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
-
 // Shader Model Support
 def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">;
 def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">;
@@ -43,130 +39,19 @@ def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
 def SupportsFMA         : Predicate<"getSubtarget().supportsFMA()">;
 def DoesNotSupportFMA   : Predicate<"!getSubtarget().supportsFMA()">;
 
-//===----------------------------------------------------------------------===//
-// Instruction Pattern Stuff
-//===----------------------------------------------------------------------===//
 
-def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::GLOBAL;
-  return false;
-}]>;
-
-def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::CONSTANT;
-  return false;
-}]>;
-
-def load_local : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::LOCAL;
-  return false;
-}]>;
-
-def load_parameter : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::PARAMETER;
-  return false;
-}]>;
-
-def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::SHARED;
-  return false;
-}]>;
-
-def store_global
-  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::GLOBAL;
-  return false;
-}]>;
-
-def store_local
-  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::LOCAL;
-  return false;
-}]>;
-
-def store_parameter
-  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::PARAMETER;
-  return false;
-}]>;
-
-def store_shared
-  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
-  const Value *Src;
-  const PointerType *PT;
-  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
-      (PT = dyn_cast<PointerType>(Src->getType())))
-    return PT->getAddressSpace() == PTX::SHARED;
-  return false;
-}]>;
-
-// Addressing modes.
-def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
-def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
-def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
-def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
-def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
-
-// Address operands
-def MEMri32 : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops RegI32, i32imm);
-}
-def MEMri64 : Operand<i64> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops RegI64, i64imm);
-}
-def MEMii32 : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops i32imm, i32imm);
-}
-def MEMii64 : Operand<i64> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops i64imm, i64imm);
-}
-// The operand here does not correspond to an actual address, so we
-// can use i32 in 64-bit address modes.
-def MEMpi : Operand<i32> {
-  let PrintMethod = "printParamOperand";
-  let MIOperandInfo = (ops i32imm);
-}
-def MEMret : Operand<i32> {
-  let PrintMethod = "printReturnOperand";
-  let MIOperandInfo = (ops i32imm);
-}
+
+// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+// def SDT_PTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart,
+//                               [SDNPHasChain, SDNPOutGlue]>;
+// def PTXcallseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd,
+//                               [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def PTXcall : SDNode<"PTXISD::CALL", SDTNone,
+                     [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>;
+
 
 // Branch & call targets have OtherVT type.
 def brtarget   : Operand<OtherVT>;
@@ -189,87 +74,73 @@ def PTXret
 def PTXcopyaddress
   : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>;
 
-// Load/store .param space
-def PTXloadparam
-  : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
-           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-def PTXstoreparam
-  : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
-           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
 
 //===----------------------------------------------------------------------===//
 // Instruction Class Templates
 //===----------------------------------------------------------------------===//
 
+// For floating-point instructions, we cannot just embed the pattern into the
+// instruction definition since we need to muck around with the rounding mode,
+// and I do not know how to insert constants into instructions directly from
+// pattern matches.
+
 //===- Floating-Point Instructions - 2 Operand Form -----------------------===//
-multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> {
+multiclass PTX_FLOAT_2OP<string opcstr> {
   def rr32 : InstPTX<(outs RegF32:$d),
-                     (ins RegF32:$a),
-                     !strconcat(opcstr, ".f32\t$d, $a"),
-                     [(set RegF32:$d, (opnode RegF32:$a))]>;
+                     (ins RndMode:$r, RegF32:$a),
+                     !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
   def ri32 : InstPTX<(outs RegF32:$d),
-                     (ins f32imm:$a),
-                     !strconcat(opcstr, ".f32\t$d, $a"),
-                     [(set RegF32:$d, (opnode fpimm:$a))]>;
+                     (ins RndMode:$r, f32imm:$a),
+                     !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
   def rr64 : InstPTX<(outs RegF64:$d),
-                     (ins RegF64:$a),
-                     !strconcat(opcstr, ".f64\t$d, $a"),
-                     [(set RegF64:$d, (opnode RegF64:$a))]>;
+                     (ins RndMode:$r, RegF64:$a),
+                     !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
   def ri64 : InstPTX<(outs RegF64:$d),
-                     (ins f64imm:$a),
-                     !strconcat(opcstr, ".f64\t$d, $a"),
-                     [(set RegF64:$d, (opnode fpimm:$a))]>;
+                     (ins RndMode:$r, f64imm:$a),
+                     !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
 }
 
 //===- Floating-Point Instructions - 3 Operand Form -----------------------===//
-multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> {
+multiclass PTX_FLOAT_3OP<string opcstr> {
   def rr32 : InstPTX<(outs RegF32:$d),
-                     (ins RegF32:$a, RegF32:$b),
-                     !strconcat(opcstr, ".f32\t$d, $a, $b"),
-                     [(set RegF32:$d, (opnode RegF32:$a, RegF32:$b))]>;
+                     (ins RndMode:$r, RegF32:$a, RegF32:$b),
+                     !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
   def ri32 : InstPTX<(outs RegF32:$d),
-                     (ins RegF32:$a, f32imm:$b),
-                     !strconcat(opcstr, ".f32\t$d, $a, $b"),
-                     [(set RegF32:$d, (opnode RegF32:$a, fpimm:$b))]>;
+                     (ins RndMode:$r, RegF32:$a, f32imm:$b),
+                     !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
   def rr64 : InstPTX<(outs RegF64:$d),
-                     (ins RegF64:$a, RegF64:$b),
-                     !strconcat(opcstr, ".f64\t$d, $a, $b"),
-                     [(set RegF64:$d, (opnode RegF64:$a, RegF64:$b))]>;
+                     (ins RndMode:$r, RegF64:$a, RegF64:$b),
+                     !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
   def ri64 : InstPTX<(outs RegF64:$d),
-                     (ins RegF64:$a, f64imm:$b),
-                     !strconcat(opcstr, ".f64\t$d, $a, $b"),
-                     [(set RegF64:$d, (opnode RegF64:$a, fpimm:$b))]>;
+                     (ins RndMode:$r, RegF64:$a, f64imm:$b),
+                     !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
 }
 
 //===- Floating-Point Instructions - 4 Operand Form -----------------------===//
-multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> {
+multiclass PTX_FLOAT_4OP<string opcstr> {
   def rrr32 : InstPTX<(outs RegF32:$d),
-                      (ins RegF32:$a, RegF32:$b, RegF32:$c),
-                      !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
-                      [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
-                                                          RegF32:$b),
-                                                 RegF32:$c))]>;
+                      (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
   def rri32 : InstPTX<(outs RegF32:$d),
-                      (ins RegF32:$a, RegF32:$b, f32imm:$c),
-                      !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
-                      [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
-                                                          RegF32:$b),
-                                                 fpimm:$c))]>;
+                      (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
+  def rii32 : InstPTX<(outs RegF32:$d),
+                      (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
   def rrr64 : InstPTX<(outs RegF64:$d),
-                      (ins RegF64:$a, RegF64:$b, RegF64:$c),
-                      !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
-                      [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
-                                                          RegF64:$b),
-                                                 RegF64:$c))]>;
+                      (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
   def rri64 : InstPTX<(outs RegF64:$d),
-                      (ins RegF64:$a, RegF64:$b, f64imm:$c),
-                      !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
-                      [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
-                                                          RegF64:$b),
-                                                 fpimm:$c))]>;
+                      (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
+  def rii64 : InstPTX<(outs RegF64:$d),
+                      (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
 }
 
-multiclass INT3<string opcstr, SDNode opnode> {
+//===- Integer Instructions - 3 Operand Form ------------------------------===//
+multiclass PTX_INT3<string opcstr, SDNode opnode> {
   def rr16 : InstPTX<(outs RegI16:$d),
                      (ins RegI16:$a, RegI16:$b),
                      !strconcat(opcstr, ".u16\t$d, $a, $b"),
@@ -296,6 +167,35 @@ multiclass INT3<string opcstr, SDNode opnode> {
                      [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
 }
 
+//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===//
+multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> {
+  def rr16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, RegI16:$b),
+                     !strconcat(opcstr, ".s16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, i16imm:$b),
+                     !strconcat(opcstr, ".s16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, RegI32:$b),
+                     !strconcat(opcstr, ".s32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, i32imm:$b),
+                     !strconcat(opcstr, ".s32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, RegI64:$b),
+                     !strconcat(opcstr, ".s64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, i64imm:$b),
+                     !strconcat(opcstr, ".s64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+}
+
+//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===//
 multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
   def ripreds : InstPTX<(outs RegPred:$d),
                      (ins RegPred:$a, i1imm:$b),
@@ -331,7 +231,8 @@ multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
                      [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
 }
 
-multiclass INT3ntnc<string opcstr, SDNode opnode> {
+//===- Integer Shift Instructions - 3 Operand Form ------------------------===//
+multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> {
   def rr16 : InstPTX<(outs RegI16:$d),
                      (ins RegI16:$a, RegI16:$b),
                      !strconcat(opcstr, "16\t$d, $a, $b"),
@@ -370,6 +271,7 @@ multiclass INT3ntnc<string opcstr, SDNode opnode> {
                      [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>;
 }
 
+//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===//
 multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
                         CondCode cmp, string cmpstr> {
   // TODO support 5-operand format: p|q, a, b, c
@@ -385,56 +287,77 @@ multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
 
   def rr_and_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_and_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+                                     RegPred:$c))]>;
   def rr_or_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_or_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
   def rr_xor_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_xor_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+                                     RegPred:$c))]>;
 
   def rr_and_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp),
+                                     (not RegPred:$c)))]>;
   def ri_and_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+                                     (not RegPred:$c)))]>;
   def rr_or_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp),
+                                    (not RegPred:$c)))]>;
   def ri_or_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp),
+                                    (not RegPred:$c)))]>;
   def rr_xor_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp),
+                                     (not RegPred:$c)))]>;
   def ri_xor_not_r
     : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+                                     (not RegPred:$c)))]>;
 }
 
-multiclass PTX_SETP_FP<RegisterClass RC, string regclsname,
+//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===//
+multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls,
                         CondCode ucmp, CondCode ocmp, string cmpstr> {
   // TODO support 5-operand format: p|q, a, b, c
 
@@ -447,137 +370,110 @@ multiclass PTX_SETP_FP<RegisterClass RC, string regclsname,
               !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
               [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>;
 
+  def ri_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+              !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>;
+  def ri_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>;
+
   def rr_and_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, "u.and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+                                     RegPred:$c))]>;
   def rr_and_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+                                     RegPred:$c))]>;
 
   def rr_or_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, "u.or.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
   def rr_or_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
               [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
 
   def rr_xor_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+                                     RegPred:$c))]>;
   def rr_xor_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+                                     RegPred:$c))]>;
 
   def rr_and_not_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, "u.and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+                                     (not RegPred:$c)))]>;
   def rr_and_not_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+                                     (not RegPred:$c)))]>;
 
   def rr_or_not_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, "u.or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp),
+                                    (not RegPred:$c)))]>;
   def rr_or_not_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp),
+                                    (not RegPred:$c)))]>;
 
   def rr_xor_not_r_u
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+                                     (not RegPred:$c)))]>;
   def rr_xor_not_r_o
     : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
-              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+                                     (not RegPred:$c)))]>;
 }
 
-multiclass PTX_SELP<RegisterClass RC, string regclsname> {
+//===- Select Predicate Instructions - 4 Operand Form ---------------------===//
+multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls,
+                    SDNode immnode> {
   def rr
     : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c),
               !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
               [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>;
+  def ri
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>;
+  def ii
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>;
 }
 
-multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> {
-  def rr32 : InstPTX<(outs RC:$d),
-                     (ins MEMri32:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRrr32:$a))]>, Requires<[Use32BitAddresses]>;
-  def rr64 : InstPTX<(outs RC:$d),
-                     (ins MEMri64:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRrr64:$a))]>, Requires<[Use64BitAddresses]>;
-  def ri32 : InstPTX<(outs RC:$d),
-                     (ins MEMri32:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRri32:$a))]>, Requires<[Use32BitAddresses]>;
-  def ri64 : InstPTX<(outs RC:$d),
-                     (ins MEMri64:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRri64:$a))]>, Requires<[Use64BitAddresses]>;
-  def ii32 : InstPTX<(outs RC:$d),
-                     (ins MEMii32:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRii32:$a))]>, Requires<[Use32BitAddresses]>;
-  def ii64 : InstPTX<(outs RC:$d),
-                     (ins MEMii64:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
-                     [(set RC:$d, (pat_load ADDRii64:$a))]>, Requires<[Use64BitAddresses]>;
-}
-
-multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
-  defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
-  defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
-  defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
-  defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
-  defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
-}
-
-multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> {
-  def rr32 : InstPTX<(outs),
-                     (ins RC:$d, MEMri32:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                     [(pat_store RC:$d, ADDRrr32:$a)]>, Requires<[Use32BitAddresses]>;
-  def rr64 : InstPTX<(outs),
-                     (ins RC:$d, MEMri64:$a),
-                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                     [(pat_store RC:$d, ADDRrr64:$a)]>, Requires<[Use64BitAddresses]>;
-  def ri32 : InstPTX<(outs),
-                   (ins RC:$d, MEMri32:$a),
-                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                   [(pat_store RC:$d, ADDRri32:$a)]>, Requires<[Use32BitAddresses]>;
-  def ri64 : InstPTX<(outs),
-                   (ins RC:$d, MEMri64:$a),
-                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                   [(pat_store RC:$d, ADDRri64:$a)]>, Requires<[Use64BitAddresses]>;
-  def ii32 : InstPTX<(outs),
-                   (ins RC:$d, MEMii32:$a),
-                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                   [(pat_store RC:$d, ADDRii32:$a)]>, Requires<[Use32BitAddresses]>;
-  def ii64 : InstPTX<(outs),
-                   (ins RC:$d, MEMii64:$a),
-                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
-                   [(pat_store RC:$d, ADDRii64:$a)]>, Requires<[Use64BitAddresses]>;
-}
 
-multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
-  defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
-  defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
-  defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
-  defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
-  defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
-}
 
 //===----------------------------------------------------------------------===//
 // Instructions
@@ -585,118 +481,61 @@ multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
 
 ///===- Integer Arithmetic Instructions -----------------------------------===//
 
-defm ADD : INT3<"add", add>;
-defm SUB : INT3<"sub", sub>;
-defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
-defm DIV : INT3<"div", udiv>;
-defm REM : INT3<"rem", urem>;
+defm ADD  : PTX_INT3<"add", add>;
+defm SUB  : PTX_INT3<"sub", sub>;
+defm MUL  : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
+defm DIV  : PTX_INT3<"div", udiv>;
+defm SDIV : PTX_INT3_SIGNED<"div", sdiv>;
+defm REM  : PTX_INT3<"rem", urem>;
 
 ///===- Floating-Point Arithmetic Instructions ----------------------------===//
 
-// Standard Unary Operations
-defm FNEG : PTX_FLOAT_2OP<"neg", fneg>;
+// FNEG
+defm FNEG : PTX_FLOAT_2OP<"neg">;
 
 // Standard Binary Operations
-defm FADD : PTX_FLOAT_3OP<"add.rn", fadd>;
-defm FSUB : PTX_FLOAT_3OP<"sub.rn", fsub>;
-defm FMUL : PTX_FLOAT_3OP<"mul.rn", fmul>;
-
-// For floating-point division:
-// SM_13+ defaults to .rn for f32 and f64,
-// SM10 must *not* provide a rounding
-
-// TODO: 
-//     - Allow user selection of rounding modes for fdiv
-//     - Add support for -prec-div=false (.approx)
-
-def FDIVrr32SM13 : InstPTX<(outs RegF32:$d),
-                       (ins RegF32:$a, RegF32:$b),
-                       "div.rn.f32\t$d, $a, $b",
-                       [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>,
-                   Requires<[FDivNeedsRoundingMode]>;
-def FDIVri32SM13 : InstPTX<(outs RegF32:$d),
-                       (ins RegF32:$a, f32imm:$b),
-                       "div.rn.f32\t$d, $a, $b",
-                       [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>,
-                   Requires<[FDivNeedsRoundingMode]>;
-def FDIVrr32SM10 : InstPTX<(outs RegF32:$d),
-                       (ins RegF32:$a, RegF32:$b),
-                       "div.f32\t$d, $a, $b",
-                       [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>,
-                   Requires<[FDivNoRoundingMode]>;
-def FDIVri32SM10 : InstPTX<(outs RegF32:$d),
-                       (ins RegF32:$a, f32imm:$b),
-                       "div.f32\t$d, $a, $b",
-                       [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>,
-                   Requires<[FDivNoRoundingMode]>;
-
-def FDIVrr64SM13 : InstPTX<(outs RegF64:$d),
-                           (ins RegF64:$a, RegF64:$b),
-                           "div.rn.f64\t$d, $a, $b",
-                           [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
-                   Requires<[FDivNeedsRoundingMode]>;
-def FDIVri64SM13 : InstPTX<(outs RegF64:$d),
-                           (ins RegF64:$a, f64imm:$b),
-                           "div.rn.f64\t$d, $a, $b",
-                           [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
-                   Requires<[FDivNeedsRoundingMode]>;
-def FDIVrr64SM10 : InstPTX<(outs RegF64:$d),
-                           (ins RegF64:$a, RegF64:$b),
-                           "div.f64\t$d, $a, $b",
-                           [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
-                   Requires<[FDivNoRoundingMode]>;
-def FDIVri64SM10 : InstPTX<(outs RegF64:$d),
-                           (ins RegF64:$a, f64imm:$b),
-                           "div.f64\t$d, $a, $b",
-                           [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
-                   Requires<[FDivNoRoundingMode]>;
-
-
+defm FADD : PTX_FLOAT_3OP<"add">;
+defm FSUB : PTX_FLOAT_3OP<"sub">;
+defm FMUL : PTX_FLOAT_3OP<"mul">;
+defm FDIV : PTX_FLOAT_3OP<"div">;
 
 // Multi-operation hybrid instructions
+defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>;
 
-// The selection of mad/fma is tricky.  In some cases, they are the *same*
-// instruction, but in other cases we may prefer one or the other.  Also,
-// different PTX versions differ on whether rounding mode flags are required.
-// In the short term, mad is supported on all PTX versions and we use a
-// default rounding mode no matter what shader model or PTX version.
-// TODO: Allow the rounding mode to be selectable through llc.
-defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>,
-                Requires<[FMadNeedsRoundingMode, SupportsFMA]>;
-defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>,
-            Requires<[FMadNoRoundingMode, SupportsFMA]>;
 
 ///===- Floating-Point Intrinsic Instructions -----------------------------===//
 
-def FSQRT32 : InstPTX<(outs RegF32:$d),
-                      (ins RegF32:$a),
-                      "sqrt.rn.f32\t$d, $a",
-                      [(set RegF32:$d, (fsqrt RegF32:$a))]>;
-
-def FSQRT64 : InstPTX<(outs RegF64:$d),
-                      (ins RegF64:$a),
-                      "sqrt.rn.f64\t$d, $a",
-                      [(set RegF64:$d, (fsqrt RegF64:$a))]>;
-
-def FSIN32 : InstPTX<(outs RegF32:$d),
-                     (ins RegF32:$a),
-                     "sin.approx.f32\t$d, $a",
-                     [(set RegF32:$d, (fsin RegF32:$a))]>;
+// SQRT
+def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                        "sqrt$r.f32\t$d, $a", []>;
+def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                        "sqrt$r.f32\t$d, $a", []>;
+def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                        "sqrt$r.f64\t$d, $a", []>;
+def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                        "sqrt$r.f64\t$d, $a", []>;
+
+// SIN
+def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                       "sin$r.f32\t$d, $a", []>;
+def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                       "sin$r.f32\t$d, $a", []>;
+def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                       "sin$r.f64\t$d, $a", []>;
+def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                       "sin$r.f64\t$d, $a", []>;
+
+// COS
+def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                       "cos$r.f32\t$d, $a", []>;
+def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                       "cos$r.f32\t$d, $a", []>;
+def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                       "cos$r.f64\t$d, $a", []>;
+def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                       "cos$r.f64\t$d, $a", []>;
 
-def FSIN64 : InstPTX<(outs RegF64:$d),
-                     (ins RegF64:$a),
-                     "sin.approx.f64\t$d, $a",
-                     [(set RegF64:$d, (fsin RegF64:$a))]>;
 
-def FCOS32 : InstPTX<(outs RegF32:$d),
-                     (ins RegF32:$a),
-                     "cos.approx.f32\t$d, $a",
-                     [(set RegF32:$d, (fcos RegF32:$a))]>;
-
-def FCOS64 : InstPTX<(outs RegF64:$d),
-                     (ins RegF64:$a),
-                     "cos.approx.f64\t$d, $a",
-                     [(set RegF64:$d, (fcos RegF64:$a))]>;
 
 
 ///===- Comparison and Selection Instructions -----------------------------===//
@@ -744,35 +583,35 @@ defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE,  "ge">;
 
 // Compare f32
 
-defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", SETUEQ, SETOEQ, "eq">;
-defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", SETUNE, SETONE, "ne">;
-defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", SETULT, SETOLT, "lt">;
-defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", SETULE, SETOLE, "le">;
-defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", SETUGT, SETOGT, "gt">;
-defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", SETUGE, SETOGE, "ge">;
+defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">;
+defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">;
+defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">;
+defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">;
 
 // Compare f64
 
-defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", SETUEQ, SETOEQ, "eq">;
-defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", SETUNE, SETONE, "ne">;
-defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", SETULT, SETOLT, "lt">;
-defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", SETULE, SETOLE, "le">;
-defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", SETUGT, SETOGT, "gt">;
-defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", SETUGE, SETOGE, "ge">;
+defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">;
+defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">;
+defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">;
+defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">;
 
 // .selp
 
-defm PTX_SELPu16 : PTX_SELP<RegI16, "u16">;
-defm PTX_SELPu32 : PTX_SELP<RegI32, "u32">;
-defm PTX_SELPu64 : PTX_SELP<RegI64, "u64">;
-defm PTX_SELPf32 : PTX_SELP<RegF32, "f32">;
-defm PTX_SELPf64 : PTX_SELP<RegF64, "f64">;
+defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>;
+defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>;
+defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>;
+defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>;
+defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>;
 
 ///===- Logic and Shift Instructions --------------------------------------===//
 
-defm SHL : INT3ntnc<"shl.b", PTXshl>;
-defm SRL : INT3ntnc<"shr.u", PTXsrl>;
-defm SRA : INT3ntnc<"shr.s", PTXsra>;
+defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>;
+defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>;
+defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>;
 
 defm AND : PTX_LOGIC<"and", and>;
 defm OR  : PTX_LOGIC<"or",  or>;
@@ -780,6 +619,24 @@ defm XOR : PTX_LOGIC<"xor", xor>;
 
 ///===- Data Movement and Conversion Instructions -------------------------===//
 
+// any_extend
+// Implement the anyext instruction in terms of the PTX cvt instructions.
+//def : Pat<(i32 (anyext RegI16:$a)), (CVT_u32_u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI16:$a)), (CVT_u64_u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI32:$a)), (CVT_u64_u32 RegI32:$a)>;
+
+// bitconvert
+// These instructions implement the bit-wise conversion between integer and
+// floating-point types.
+def MOVi32f32
+  : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>;
+def MOVf32i32
+  : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>;
+def MOVi64f64
+  : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>;
+def MOVf64i64
+  : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>;
+
 let neverHasSideEffects = 1 in {
   def MOVPREDrr
     : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>;
@@ -825,278 +682,332 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
               [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
 }
 
-// Loads
-defm LDg : PTX_LD_ALL<"ld.global", load_global>;
-defm LDc : PTX_LD_ALL<"ld.const",  load_constant>;
-defm LDl : PTX_LD_ALL<"ld.local",  load_local>;
-defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
+// PTX cvt instructions
+// Note all of these may actually be used, we just define all possible patterns
+// here (that make sense).
+// FIXME: Can we collapse this somehow into a multiclass def?
+
+// To i16
+def CVTu16u32
+  : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>;
+def CVTu16u64
+  : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>;
+def CVTu16f32
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u16.f32\t$d, $a", []>;
+def CVTs16f32
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s16.f32\t$d, $a", []>;
+def CVTu16f64
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u16.f64\t$d, $a", []>;
+def CVTs16f64
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s16.f64\t$d, $a", []>;
+
+// To i32
+def CVTu32u16
+  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>;
+def CVTs32s16
+  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>;
+def CVTu32u64
+  : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>;
+def CVTu32f32
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u32.f32\t$d, $a", []>;
+def CVTs32f32
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s32.f32\t$d, $a", []>;
+def CVTu32f64
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u32.f64\t$d, $a", []>;
+def CVTs32f64
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s32.f64\t$d, $a", []>;
+
+// To i64
+def CVTu64u16
+  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>;
+def CVTs64s16
+  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>;
+def CVTu64u32
+  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>;
+def CVTs64s32
+  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>;
+def CVTu64f32
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u64.f32\t$d, $a", []>;
+def CVTs64f32
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s64.f32\t$d, $a", []>;
+def CVTu64f64
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u64.f64\t$d, $a", []>;
+def CVTs64f64
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s64.f64\t$d, $a", []>;
+
+// To f32
+def CVTf32u16
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f32.u16\t$d, $a", []>;
+def CVTf32s16
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f32.s16\t$d, $a", []>;
+def CVTf32u32
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f32.u32\t$d, $a", []>;
+def CVTf32s32
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f32.s32\t$d, $a", []>;
+def CVTf32u64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f32.u64\t$d, $a", []>;
+def CVTf32s64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f32.s64\t$d, $a", []>;
+def CVTf32f64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.f32.f64\t$d, $a", []>;
+
+// To f64
+def CVTf64u16
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f64.u16\t$d, $a", []>;
+def CVTf64s16
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f64.s16\t$d, $a", []>;
+def CVTf64u32
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f64.u32\t$d, $a", []>;
+def CVTf64s32
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f64.s32\t$d, $a", []>;
+def CVTf64u64
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f64.u64\t$d, $a", []>;
+def CVTf64s64
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f64.s64\t$d, $a", []>;
+def CVTf64f32
+  : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>;
+
+  ///===- Control Flow Instructions -----------------------------------------===//
 
-// These instructions are used to load/store from the .param space for
-// device and kernel parameters
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  def BRAd
+    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
+}
 
-let hasSideEffects = 1 in {
-  def LDpiPred : InstPTX<(outs RegPred:$d), (ins MEMpi:$a),
-                         "ld.param.pred\t$d, [$a]",
-                         [(set RegPred:$d, (PTXloadparam timm:$a))]>;
-  def LDpiU16  : InstPTX<(outs RegI16:$d), (ins MEMpi:$a),
-                         "ld.param.u16\t$d, [$a]",
-                         [(set RegI16:$d, (PTXloadparam timm:$a))]>;
-  def LDpiU32  : InstPTX<(outs RegI32:$d), (ins MEMpi:$a),
-                         "ld.param.u32\t$d, [$a]",
-                         [(set RegI32:$d, (PTXloadparam timm:$a))]>;
-  def LDpiU64  : InstPTX<(outs RegI64:$d), (ins MEMpi:$a),
-                         "ld.param.u64\t$d, [$a]",
-                         [(set RegI64:$d, (PTXloadparam timm:$a))]>;
-  def LDpiF32  : InstPTX<(outs RegF32:$d), (ins MEMpi:$a),
-                         "ld.param.f32\t$d, [$a]",
-                         [(set RegF32:$d, (PTXloadparam timm:$a))]>;
-  def LDpiF64  : InstPTX<(outs RegF64:$d), (ins MEMpi:$a),
-                         "ld.param.f64\t$d, [$a]",
-                         [(set RegF64:$d, (PTXloadparam timm:$a))]>;
-
-  def STpiPred : InstPTX<(outs), (ins MEMret:$d, RegPred:$a),
-                         "st.param.pred\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegPred:$a)]>;
-  def STpiU16  : InstPTX<(outs), (ins MEMret:$d, RegI16:$a),
-                         "st.param.u16\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegI16:$a)]>;
-  def STpiU32  : InstPTX<(outs), (ins MEMret:$d, RegI32:$a),
-                         "st.param.u32\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegI32:$a)]>;
-  def STpiU64  : InstPTX<(outs), (ins MEMret:$d, RegI64:$a),
-                         "st.param.u64\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegI64:$a)]>;
-  def STpiF32  : InstPTX<(outs), (ins MEMret:$d, RegF32:$a),
-                         "st.param.f32\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegF32:$a)]>;
-  def STpiF64  : InstPTX<(outs), (ins MEMret:$d, RegF64:$a),
-                         "st.param.f64\t[$d], $a",
-                         [(PTXstoreparam timm:$d, RegF64:$a)]>;
+let isBranch = 1, isTerminator = 1 in {
+  // FIXME: The pattern part is blank because I cannot (or do not yet know
+  // how to) use the first operand of PredicateOperand (a RegPred register) here
+  def BRAdp
+    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
+              [/*(brcond pred:$_p, bb:$d)*/]>;
 }
 
-// Stores
-defm STg : PTX_ST_ALL<"st.global", store_global>;
-defm STl : PTX_ST_ALL<"st.local",  store_local>;
-defm STs : PTX_ST_ALL<"st.shared", store_shared>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
+  def RET  : InstPTX<(outs), (ins), "ret",  [(PTXret)]>;
+}
 
-// defm STp : PTX_ST_ALL<"st.param",  store_parameter>;
-// defm LDp : PTX_LD_ALL<"ld.param",  load_parameter>;
-// TODO: Do something with st.param if/when it is needed.
+let hasSideEffects = 1 in {
+  def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>;
+}
 
-// Conversion to pred
-// PTX does not directly support converting to a predicate type, so we fake it
-// by performing a greater-than test between the value and zero.  This follows
-// the C convention that any non-zero value is equivalent to 'true'.
-def CVT_pred_u16
-  : InstPTX<(outs RegPred:$d), (ins RegI16:$a), "setp.gt.u16\t$d, $a, 0",
-            [(set RegPred:$d, (trunc RegI16:$a))]>;
+///===- Parameter Passing Pseudo-Instructions -----------------------------===//
+
+def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b),
+                            "mov.pred\t$a, %param$b", []>;
+def READPARAMI16  : InstPTX<(outs RegI16:$a), (ins i32imm:$b),
+                            "mov.b16\t$a, %param$b", []>;
+def READPARAMI32  : InstPTX<(outs RegI32:$a), (ins i32imm:$b),
+                            "mov.b32\t$a, %param$b", []>;
+def READPARAMI64  : InstPTX<(outs RegI64:$a), (ins i32imm:$b),
+                            "mov.b64\t$a, %param$b", []>;
+def READPARAMF32  : InstPTX<(outs RegF32:$a), (ins i32imm:$b),
+                            "mov.f32\t$a, %param$b", []>;
+def READPARAMF64  : InstPTX<(outs RegF64:$a), (ins i32imm:$b),
+                            "mov.f64\t$a, %param$b", []>;
+
+def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>;
+def WRITEPARAMI16  : InstPTX<(outs), (ins RegI16:$a), "//w", []>;
+def WRITEPARAMI32  : InstPTX<(outs), (ins RegI32:$a), "//w", []>;
+def WRITEPARAMI64  : InstPTX<(outs), (ins RegI64:$a), "//w", []>;
+def WRITEPARAMF32  : InstPTX<(outs), (ins RegF32:$a), "//w", []>;
+def WRITEPARAMF64  : InstPTX<(outs), (ins RegF64:$a), "//w", []>;
 
-def CVT_pred_u32
-  : InstPTX<(outs RegPred:$d), (ins RegI32:$a), "setp.gt.u32\t$d, $a, 0",
-            [(set RegPred:$d, (trunc RegI32:$a))]>;
 
-def CVT_pred_u64
-  : InstPTX<(outs RegPred:$d), (ins RegI64:$a), "setp.gt.u64\t$d, $a, 0",
-            [(set RegPred:$d, (trunc RegI64:$a))]>;
+//===----------------------------------------------------------------------===//
+// Instruction Selection Patterns
+//===----------------------------------------------------------------------===//
 
-def CVT_pred_f32
-  : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "setp.gt.f32\t$d, $a, 0",
-            [(set RegPred:$d, (fp_to_uint RegF32:$a))]>;
+// FADD
+def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
+          (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)),
+          (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)),
+          (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)),
+          (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FSUB
+def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)),
+          (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)),
+          (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)),
+          (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)),
+          (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL
+def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)),
+          (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)),
+          (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)),
+          (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)),
+          (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FDIV
+def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)),
+          (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)),
+          (FDIVri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)),
+          (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)),
+          (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL+FADD
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)),
+          (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)),
+          (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)),
+          (FMADrrr32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)),
+          (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)),
+          (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)),
+          (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)),
+          (FMADrri64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>;
+
+// FNEG
+def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>;
+
+// FSQRT
+def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>;
+
+// FSIN
+def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>;
+
+// FCOS
+def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>;
+
+// Type conversion notes:
+// - PTX does not directly support converting a predicate to a value, so we
+//   use a select instruction to select either 0 or 1 (integer or fp) based
+//   on the truth value of the predicate.
+// - PTX does not directly support converting to a predicate type, so we fake it
+//   by performing a greater-than test between the value and zero.  This follows
+//   the C convention that any non-zero value is equivalent to 'true'.
 
-def CVT_pred_f64
-  : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "setp.gt.f64\t$d, $a, 0",
-            [(set RegPred:$d, (fp_to_uint RegF64:$a))]>;
+// Conversion to pred
+def : Pat<(i1 (trunc RegI16:$a)),      (SETPGTu16ri RegI16:$a, 0)>;
+def : Pat<(i1 (trunc RegI32:$a)),      (SETPGTu32ri RegI32:$a, 0)>;
+def : Pat<(i1 (trunc RegI64:$a)),      (SETPGTu64ri RegI64:$a, 0)>;
+def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>;
+def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>;
 
 // Conversion to u16
-// PTX does not directly support converting a predicate to a value, so we
-// use a select instruction to select either 0 or 1 (integer or fp) based
-// on the truth value of the predicate.
-def CVT_u16_preda
-  : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
-            [(set RegI16:$d, (anyext RegPred:$a))]>;
-
-def CVT_u16_pred
-  : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
-            [(set RegI16:$d, (zext RegPred:$a))]>;
-
-def CVT_u16_preds
-  : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
-            [(set RegI16:$d, (sext RegPred:$a))]>;
-
-def CVT_u16_u32
-  : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a",
-            [(set RegI16:$d, (trunc RegI32:$a))]>;
-
-def CVT_u16_u64
-  : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a",
-            [(set RegI16:$d, (trunc RegI64:$a))]>;
-
-def CVT_u16_f32
-  : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rzi.u16.f32\t$d, $a",
-            [(set RegI16:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u16_f64
-  : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rzi.u16.f64\t$d, $a",
-            [(set RegI16:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i16 (anyext RegPred:$a)),    (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (sext RegPred:$a)),      (SELPi16ii RegPred:$a, 0xFFFF, 0)>;
+def : Pat<(i16 (zext RegPred:$a)),      (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (trunc RegI32:$a)),      (CVTu16u32 RegI32:$a)>;
+def : Pat<(i16 (trunc RegI64:$a)),      (CVTu16u64 RegI64:$a)>;
+def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>;
+def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>;
 
 // Conversion to u32
-
-def CVT_u32_pred
-  : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.u32\t$d, 1, 0, $a",
-            [(set RegI32:$d, (zext RegPred:$a))]>;
-
-def CVT_u32_b16
-  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a",
-            [(set RegI32:$d, (anyext RegI16:$a))]>;
-
-def CVT_u32_u16
-  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a",
-            [(set RegI32:$d, (zext RegI16:$a))]>;
-
-def CVT_u32_preds
-  : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.u32\t$d, 1, 0, $a",
-            [(set RegI32:$d, (sext RegPred:$a))]>;
-
-def CVT_u32_s16
-  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.s16\t$d, $a",
-            [(set RegI32:$d, (sext RegI16:$a))]>;
-
-def CVT_u32_u64
-  : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a",
-            [(set RegI32:$d, (trunc RegI64:$a))]>;
-
-def CVT_u32_f32
-  : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rzi.u32.f32\t$d, $a",
-            [(set RegI32:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u32_f64
-  : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rzi.u32.f64\t$d, $a",
-            [(set RegI32:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i32 (anyext RegPred:$a)),    (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (sext RegPred:$a)),      (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>;
+def : Pat<(i32 (zext RegPred:$a)),      (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (anyext RegI16:$a)),     (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (sext RegI16:$a)),       (CVTs32s16 RegI16:$a)>;
+def : Pat<(i32 (zext RegI16:$a)),       (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (trunc RegI64:$a)),      (CVTu32u64 RegI64:$a)>;
+def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>;
 
 // Conversion to u64
-
-def CVT_u64_pred
-  : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.u64\t$d, 1, 0, $a",
-            [(set RegI64:$d, (zext RegPred:$a))]>;
-
-def CVT_u64_preds
-  : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.u64\t$d, 1, 0, $a",
-            [(set RegI64:$d, (sext RegPred:$a))]>;
-
-def CVT_u64_u16
-  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a",
-            [(set RegI64:$d, (zext RegI16:$a))]>;
-
-def CVT_u64_s16
-  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.s16\t$d, $a",
-            [(set RegI64:$d, (sext RegI16:$a))]>;
-
-def CVT_u64_u32
-  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a",
-            [(set RegI64:$d, (zext RegI32:$a))]>;
-
-def CVT_u64_s32
-  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.s32\t$d, $a",
-            [(set RegI64:$d, (sext RegI32:$a))]>;
-
-def CVT_u64_f32
-  : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rzi.u64.f32\t$d, $a",
-            [(set RegI64:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u64_f64
-  : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rzi.u64.f64\t$d, $a",
-            [(set RegI64:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i64 (anyext RegPred:$a)),    (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (sext RegPred:$a)),      (SELPi64ii RegPred:$a,
+                                         0xFFFFFFFFFFFFFFFF, 0)>;
+def : Pat<(i64 (zext RegPred:$a)),      (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (anyext RegI16:$a)),     (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (sext RegI16:$a)),       (CVTs64s16 RegI16:$a)>;
+def : Pat<(i64 (zext RegI16:$a)),       (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (anyext RegI32:$a)),     (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (sext RegI32:$a)),       (CVTs64s32 RegI32:$a)>;
+def : Pat<(i64 (zext RegI32:$a)),       (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>;
 
 // Conversion to f32
-
-def CVT_f32_pred
-  : InstPTX<(outs RegF32:$d), (ins RegPred:$a),
-            "selp.f32\t$d, 0F3F800000, 0F00000000, $a",  // 1.0
-            [(set RegF32:$d, (uint_to_fp RegPred:$a))]>;
-
-def CVT_f32_u16
-  : InstPTX<(outs RegF32:$d), (ins RegI16:$a), "cvt.rn.f32.u16\t$d, $a",
-            [(set RegF32:$d, (uint_to_fp RegI16:$a))]>;
-
-def CVT_f32_u32
-  : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "cvt.rn.f32.u32\t$d, $a",
-            [(set RegF32:$d, (uint_to_fp RegI32:$a))]>;
-
-def CVT_f32_u64
-  : InstPTX<(outs RegF32:$d), (ins RegI64:$a), "cvt.rn.f32.u64\t$d, $a",
-            [(set RegF32:$d, (uint_to_fp RegI64:$a))]>;
-
-def CVT_f32_f64
-  : InstPTX<(outs RegF32:$d), (ins RegF64:$a), "cvt.rn.f32.f64\t$d, $a",
-            [(set RegF32:$d, (fround RegF64:$a))]>;
+def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a,
+                                        (MOVf32i32 0x3F800000), (MOVf32i32 0))>;
+def : Pat<(f32 (uint_to_fp RegI16:$a)),  (CVTf32u16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (sint_to_fp RegI16:$a)),  (CVTf32s16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (uint_to_fp RegI32:$a)),  (CVTf32u32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (sint_to_fp RegI32:$a)),  (CVTf32s32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (uint_to_fp RegI64:$a)),  (CVTf32u64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (sint_to_fp RegI64:$a)),  (CVTf32s64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (fround RegF64:$a)),      (CVTf32f64 RndDefault, RegF64:$a)>;
+def : Pat<(f32 (bitconvert RegI32:$a)),  (MOVf32i32 RegI32:$a)>;
 
 // Conversion to f64
+def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a,
+                                (MOVf64i64 0x3F80000000000000), (MOVf64i64 0))>;
+def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (fextend RegF32:$a)),    (CVTf64f32 RegF32:$a)>;
+def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>;
 
-def CVT_f64_pred
-  : InstPTX<(outs RegF64:$d), (ins RegPred:$a), 
-            "selp.f64\t$d, 0D3F80000000000000, 0D0000000000000000, $a",  // 1.0
-            [(set RegF64:$d, (uint_to_fp RegPred:$a))]>;
-
-def CVT_f64_u16
-  : InstPTX<(outs RegF64:$d), (ins RegI16:$a), "cvt.rn.f64.u16\t$d, $a",
-            [(set RegF64:$d, (uint_to_fp RegI16:$a))]>;
-
-def CVT_f64_u32
-  : InstPTX<(outs RegF64:$d), (ins RegI32:$a), "cvt.rn.f64.u32\t$d, $a",
-            [(set RegF64:$d, (uint_to_fp RegI32:$a))]>;
-
-def CVT_f64_u64
-  : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "cvt.rn.f64.u64\t$d, $a",
-            [(set RegF64:$d, (uint_to_fp RegI64:$a))]>;
-
-def CVT_f64_f32
-  : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a",
-            [(set RegF64:$d, (fextend RegF32:$a))]>;
-
-///===- Control Flow Instructions -----------------------------------------===//
-
-let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
-  def BRAd
-    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
-}
-
-let isBranch = 1, isTerminator = 1 in {
-  // FIXME: The pattern part is blank because I cannot (or do not yet know
-  // how to) use the first operand of PredicateOperand (a RegPred register) here
-  def BRAdp
-    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
-              [/*(brcond pred:$_p, bb:$d)*/]>;
-}
-
-let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-  def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
-  def RET  : InstPTX<(outs), (ins), "ret",  [(PTXret)]>;
-}
-
-///===- Spill Instructions ------------------------------------------------===//
-// Special instructions used for stack spilling
-def STACKSTOREI16 : InstPTX<(outs), (ins i32imm:$d, RegI16:$a),
-                            "mov.u16\ts$d, $a", []>;
-def STACKSTOREI32 : InstPTX<(outs), (ins i32imm:$d, RegI32:$a),
-                            "mov.u32\ts$d, $a", []>;
-def STACKSTOREI64 : InstPTX<(outs), (ins i32imm:$d, RegI64:$a),
-                            "mov.u64\ts$d, $a", []>;
-def STACKSTOREF32 : InstPTX<(outs), (ins i32imm:$d, RegF32:$a),
-                            "mov.f32\ts$d, $a", []>;
-def STACKSTOREF64 : InstPTX<(outs), (ins i32imm:$d, RegF64:$a),
-                            "mov.f64\ts$d, $a", []>;
-
-def STACKLOADI16 : InstPTX<(outs), (ins RegI16:$d, i32imm:$a),
-                           "mov.u16\t$d, s$a", []>;
-def STACKLOADI32 : InstPTX<(outs), (ins RegI32:$d, i32imm:$a),
-                           "mov.u32\t$d, s$a", []>;
-def STACKLOADI64 : InstPTX<(outs), (ins RegI64:$d, i32imm:$a),
-                           "mov.u64\t$d, s$a", []>;
-def STACKLOADF32 : InstPTX<(outs), (ins RegF32:$d, i32imm:$a),
-                           "mov.f32\t$d, s$a", []>;
-def STACKLOADF64 : InstPTX<(outs), (ins RegF64:$d, i32imm:$a),
-                           "mov.f64\t$d, s$a", []>;
 
 ///===- Intrinsic Instructions --------------------------------------------===//
-
 include "PTXIntrinsicInstrInfo.td"
+
+///===- Load/Store Instructions -------------------------------------------===//
+include "PTXInstrLoadStore.td"
+
diff --git a/lib/Target/PTX/PTXInstrLoadStore.td b/lib/Target/PTX/PTXInstrLoadStore.td
new file mode 100644
index 0000000000000..9b4f56cf25c69
--- /dev/null
+++ b/lib/Target/PTX/PTXInstrLoadStore.td
@@ -0,0 +1,278 @@
+//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX load/store instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Addressing Predicates
+// We have to differentiate between 32- and 64-bit pointer types
+def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
+def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments for Loads/Stores
+//===----------------------------------------------------------------------===//
+
+def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  const Value *Src;
+  const PointerType *PT;
+  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+      (PT = dyn_cast<PointerType>(Src->getType())))
+    return PT->getAddressSpace() == PTXStateSpace::Global;
+  return false;
+}]>;
+
+def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  const Value *Src;
+  const PointerType *PT;
+  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+      (PT = dyn_cast<PointerType>(Src->getType())))
+    return PT->getAddressSpace() == PTXStateSpace::Constant;
+  return false;
+}]>;
+
+def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  const Value *Src;
+  const PointerType *PT;
+  if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+      (PT = dyn_cast<PointerType>(Src->getType())))
+    return PT->getAddressSpace() == PTXStateSpace::Shared;
+  return false;
+}]>;
+
+def store_global
+  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+  const Value *Src;
+  const PointerType *PT;
+  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
+      (PT = dyn_cast<PointerType>(Src->getType())))
+    return PT->getAddressSpace() == PTXStateSpace::Global;
+  return false;
+}]>;
+
+def store_shared
+  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+  const Value *Src;
+  const PointerType *PT;
+  if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
+      (PT = dyn_cast<PointerType>(Src->getType())))
+    return PT->getAddressSpace() == PTXStateSpace::Shared;
+  return false;
+}]>;
+
+// Addressing modes.
+def ADDRrr32    : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRrr64    : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
+def ADDRri32    : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
+def ADDRri64    : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
+def ADDRii32    : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
+def ADDRii64    : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
+def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>;
+def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>;
+
+// Address operands
+def MEMri32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops RegI32, i32imm);
+}
+def MEMri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops RegI64, i64imm);
+}
+def LOCALri32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+def LOCALri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i64imm, i64imm);
+}
+def MEMii32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+def MEMii64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i64imm, i64imm);
+}
+// The operand here does not correspond to an actual address, so we
+// can use i32 in 64-bit address modes.
+def MEMpi : Operand<i32> {
+  let PrintMethod = "printParamOperand";
+  let MIOperandInfo = (ops i32imm);
+}
+def MEMret : Operand<i32> {
+  let PrintMethod = "printReturnOperand";
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// Load/store .param space
+def PTXloadparam
+  : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>,
+           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXstoreparam
+  : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
+           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
+def PTXreadparam
+  : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
+      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXwriteparam
+  : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>,
+      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Classes for loads/stores
+//===----------------------------------------------------------------------===//
+multiclass PTX_LD<string opstr, string typestr,
+           RegisterClass RC, PatFrag pat_load> {
+  def rr32 : InstPTX<(outs RC:$d),
+                     (ins MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRrr32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def rr64 : InstPTX<(outs RC:$d),
+                     (ins MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRrr64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+  def ri32 : InstPTX<(outs RC:$d),
+                     (ins MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRri32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def ri64 : InstPTX<(outs RC:$d),
+                     (ins MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRri64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+  def ii32 : InstPTX<(outs RC:$d),
+                     (ins MEMii32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRii32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def ii64 : InstPTX<(outs RC:$d),
+                     (ins MEMii64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRii64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_ST<string opstr, string typestr, RegisterClass RC,
+                  PatFrag pat_store> {
+  def rr32 : InstPTX<(outs),
+                     (ins RC:$d, MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRrr32:$a)]>,
+                     Requires<[Use32BitAddresses]>;
+  def rr64 : InstPTX<(outs),
+                     (ins RC:$d, MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRrr64:$a)]>,
+                     Requires<[Use64BitAddresses]>;
+  def ri32 : InstPTX<(outs),
+                   (ins RC:$d, MEMri32:$a),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                   [(pat_store RC:$d, ADDRri32:$a)]>,
+                   Requires<[Use32BitAddresses]>;
+  def ri64 : InstPTX<(outs),
+                   (ins RC:$d, MEMri64:$a),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                   [(pat_store RC:$d, ADDRri64:$a)]>,
+                   Requires<[Use64BitAddresses]>;
+  def ii32 : InstPTX<(outs),
+                   (ins RC:$d, MEMii32:$a),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                   [(pat_store RC:$d, ADDRii32:$a)]>,
+                   Requires<[Use32BitAddresses]>;
+  def ii64 : InstPTX<(outs),
+                   (ins RC:$d, MEMii64:$a),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                   [(pat_store RC:$d, ADDRii64:$a)]>,
+                   Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> {
+  def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a),
+                      !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+                       [(set RC:$d, (load_global ADDRlocal32:$a))]>;
+  def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a),
+                      !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+                       [(set RC:$d, (load_global ADDRlocal64:$a))]>;
+  def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a),
+                      !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+                       [(store_global RC:$d, ADDRlocal32:$a)]>;
+  def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a),
+                      !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+                       [(store_global RC:$d, ADDRlocal64:$a)]>;
+}
+
+multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> {
+  let hasSideEffects = 1 in {
+  def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a),
+                     !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (PTXloadparam texternalsym:$a))]>;
+  def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a),
+                     !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")),
+                     [(PTXstoreparam texternalsym:$d, RC:$a)]>;
+  }
+}
+
+multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
+  defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
+  defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
+  defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
+  defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
+  defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
+}
+
+multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
+  defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
+  defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
+  defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
+  defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
+  defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for loads/stores
+//===----------------------------------------------------------------------===//
+
+// Global/shared stores
+defm STg : PTX_ST_ALL<"st.global", store_global>;
+defm STs : PTX_ST_ALL<"st.shared", store_shared>;
+
+// Global/shared/constant loads
+defm LDg : PTX_LD_ALL<"ld.global", load_global>;
+defm LDc : PTX_LD_ALL<"ld.const",  load_constant>;
+defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
+
+// Param loads/stores
+defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>;
+defm PARAMU16  : PTX_PARAM_LD_ST<".u16", RegI16>;
+defm PARAMU32  : PTX_PARAM_LD_ST<".u32", RegI32>;
+defm PARAMU64  : PTX_PARAM_LD_ST<".u64", RegI64>;
+defm PARAMF32  : PTX_PARAM_LD_ST<".f32", RegF32>;
+defm PARAMF64  : PTX_PARAM_LD_ST<".f64", RegF64>;
+
+// Local loads/stores
+defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>;
+defm LOCALU16  : PTX_LOCAL_LD_ST<".u16", RegI16>;
+defm LOCALU32  : PTX_LOCAL_LD_ST<".u32", RegI32>;
+defm LOCALU64  : PTX_LOCAL_LD_ST<".u64", RegI64>;
+defm LOCALF32  : PTX_LOCAL_LD_ST<".f32", RegF32>;
+defm LOCALF64  : PTX_LOCAL_LD_ST<".f64", RegF64>;
+
diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
index 8d97909d339aa..9de1cb62719a6 100644
--- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td
+++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
@@ -25,37 +25,63 @@ class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
 
 // TODO Add read vector-version of special registers
 
-//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", int_ptx_read_tid_r64>;
-def PTX_READ_TID_X   : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", int_ptx_read_tid_x>;
-def PTX_READ_TID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", int_ptx_read_tid_y>;
-def PTX_READ_TID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", int_ptx_read_tid_z>;
-def PTX_READ_TID_W   : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", int_ptx_read_tid_w>;
+//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid",
+//                                                     int_ptx_read_tid_r64>;
+def PTX_READ_TID_X   : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+                                                     int_ptx_read_tid_x>;
+def PTX_READ_TID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+                                                     int_ptx_read_tid_y>;
+def PTX_READ_TID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+                                                     int_ptx_read_tid_z>;
+def PTX_READ_TID_W   : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+                                                     int_ptx_read_tid_w>;
 
-//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", int_ptx_read_ntid_r64>;
-def PTX_READ_NTID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", int_ptx_read_ntid_x>;
-def PTX_READ_NTID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", int_ptx_read_ntid_y>;
-def PTX_READ_NTID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", int_ptx_read_ntid_z>;
-def PTX_READ_NTID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", int_ptx_read_ntid_w>;
+//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid",
+//                                                      int_ptx_read_ntid_r64>;
+def PTX_READ_NTID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+                                                      int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+                                                      int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+                                                      int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+                                                      int_ptx_read_ntid_w>;
 
-def PTX_READ_LANEID  : PTX_READ_SPECIAL_REGISTER_R32<"laneid", int_ptx_read_laneid>;
-def PTX_READ_WARPID  : PTX_READ_SPECIAL_REGISTER_R32<"warpid", int_ptx_read_warpid>;
-def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", int_ptx_read_nwarpid>;
+def PTX_READ_LANEID  : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+                                                     int_ptx_read_laneid>;
+def PTX_READ_WARPID  : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+                                                     int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+                                                     int_ptx_read_nwarpid>;
 
-//def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
-def PTX_READ_CTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", int_ptx_read_ctaid_x>;
-def PTX_READ_CTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", int_ptx_read_ctaid_y>;
-def PTX_READ_CTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", int_ptx_read_ctaid_z>;
-def PTX_READ_CTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", int_ptx_read_ctaid_w>;
+//def PTX_READ_CTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
+def PTX_READ_CTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+                                                       int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+                                                       int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+                                                       int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+                                                       int_ptx_read_ctaid_w>;
 
-//def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
-def PTX_READ_NCTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", int_ptx_read_nctaid_x>;
-def PTX_READ_NCTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", int_ptx_read_nctaid_y>;
-def PTX_READ_NCTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", int_ptx_read_nctaid_z>;
-def PTX_READ_NCTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", int_ptx_read_nctaid_w>;
+//def PTX_READ_NCTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
+def PTX_READ_NCTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+                                                        int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+                                                        int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+                                                        int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+                                                        int_ptx_read_nctaid_w>;
 
-def PTX_READ_SMID  : PTX_READ_SPECIAL_REGISTER_R32<"smid", int_ptx_read_smid>;
-def PTX_READ_NSMID  : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", int_ptx_read_nsmid>;
-def PTX_READ_GRIDID  : PTX_READ_SPECIAL_REGISTER_R32<"gridid", int_ptx_read_gridid>;
+def PTX_READ_SMID  : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+                                                   int_ptx_read_smid>;
+def PTX_READ_NSMID  : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+                                                    int_ptx_read_nsmid>;
+def PTX_READ_GRIDID  : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+                                                     int_ptx_read_gridid>;
 
 def PTX_READ_LANEMASK_EQ
   : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp
index b13a3dace1309..468ce9301de4a 100644
--- a/lib/Target/PTX/PTXMCAsmStreamer.cpp
+++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp
@@ -100,7 +100,7 @@ public:
   /// @{
 
   virtual void ChangeSection(const MCSection *Section);
-  virtual void InitSections() {}
+  virtual void InitSections() { /* PTX does not use sections */ }
 
   virtual void EmitLabel(MCSymbol *Symbol);
 
@@ -132,7 +132,9 @@ public:
   ///
   /// @param Symbol - The common symbol to emit.
   /// @param Size - The size of the common symbol.
-  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size);
+  /// @param ByteAlignment - The alignment of the common symbol in bytes.
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment);
 
   virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
                             unsigned Size = 0, unsigned ByteAlignment = 0);
@@ -233,7 +235,7 @@ void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) {
 void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
-  //assert(getCurrentSection() && "Cannot emit before setting section!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
 
   OS << *Symbol << MAI.getLabelSuffix();
   EmitEOL();
@@ -283,7 +285,8 @@ void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
 void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                         unsigned ByteAlignment) {}
 
-void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {}
+void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                             unsigned ByteAlignment) {}
 
 void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
                                     unsigned Size, unsigned ByteAlignment) {}
@@ -510,7 +513,7 @@ void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) {
 
   // If we have an AsmPrinter, use that to print, otherwise print the MCInst.
   if (InstPrinter)
-    InstPrinter->printInst(&Inst, OS);
+    InstPrinter->printInst(&Inst, OS, "");
   else
     Inst.print(OS, &MAI);
   EmitEOL();
@@ -533,7 +536,7 @@ namespace llvm {
                                    formatted_raw_ostream &OS,
                                    bool isVerboseAsm, bool useLoc, bool useCFI,
                                    MCInstPrinter *IP,
-                                   MCCodeEmitter *CE, TargetAsmBackend *TAB,
+                                   MCCodeEmitter *CE, MCAsmBackend *MAB,
                                    bool ShowInst) {
     return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
                                 IP, CE, ShowInst);
diff --git a/lib/Target/PTX/PTXMCInstLower.cpp b/lib/Target/PTX/PTXMCInstLower.cpp
new file mode 100644
index 0000000000000..142e639b3ef77
--- /dev/null
+++ b/lib/Target/PTX/PTXMCInstLower.cpp
@@ -0,0 +1,32 @@
+//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PTX MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXAsmPrinter.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        PTXAsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp;
+    OutMI.addOperand(AP.lowerOperand(MO));
+  }
+}
+
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
index 6fe9e6c3f6572..b33a273dc93d6 100644
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ b/lib/Target/PTX/PTXMFInfoExtract.cpp
@@ -52,36 +52,12 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  DEBUG(dbgs() << "******** PTX FUNCTION LOCAL VAR REG DEF ********\n");
-
-  DEBUG(dbgs()
-        << "PTX::NoRegister == " << PTX::NoRegister << "\n"
-        << "PTX::NUM_TARGET_REGS == " << PTX::NUM_TARGET_REGS << "\n");
-
-  DEBUG(for (unsigned reg = PTX::NoRegister + 1;
-             reg < PTX::NUM_TARGET_REGS; ++reg)
-          if (MRI.isPhysRegUsed(reg))
-            dbgs() << "Used Reg: " << reg << "\n";);
-
-  // FIXME: This is a slow linear scanning
-  for (unsigned reg = PTX::NoRegister + 1; reg < PTX::NUM_TARGET_REGS; ++reg)
-    if (MRI.isPhysRegUsed(reg) &&
-        !MFI->isRetReg(reg) &&
-        (MFI->isKernel() || !MFI->isArgReg(reg)))
-      MFI->addLocalVarReg(reg);
-
-  // Notify MachineFunctionInfo that I've done adding local var reg
-  MFI->doneAddLocalVar();
-
-  DEBUG(for (PTXMachineFunctionInfo::reg_iterator
-             i = MFI->argRegBegin(), e = MFI->argRegEnd();
-             i != e; ++i)
-        dbgs() << "Arg Reg: " << *i << "\n";);
-
-  DEBUG(for (PTXMachineFunctionInfo::reg_iterator
-             i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd();
-             i != e; ++i)
-        dbgs() << "Local Var Reg: " << *i << "\n";);
+  // Generate list of all virtual registers used in this function
+  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
+    MFI->addVirtualRegister(TRC, Reg);
+  }
 
   return false;
 }
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
index 9d65f5bd1aded..3b985f7dd6b00 100644
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ b/lib/Target/PTX/PTXMachineFunctionInfo.h
@@ -15,75 +15,148 @@
 #define PTX_MACHINE_FUNCTION_INFO_H
 
 #include "PTX.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
+
 /// PTXMachineFunctionInfo - This class is derived from MachineFunction and
 /// contains private PTX target-specific information for each MachineFunction.
 ///
 class PTXMachineFunctionInfo : public MachineFunctionInfo {
 private:
-  bool is_kernel;
-  std::vector<unsigned> reg_arg, reg_local_var;
-  std::vector<unsigned> reg_ret;
-  bool _isDoneAddArg;
+  bool IsKernel;
+  DenseSet<unsigned> RegArgs;
+  DenseSet<unsigned> RegRets;
+
+  typedef std::vector<unsigned> RegisterList;
+  typedef DenseMap<const TargetRegisterClass*, RegisterList> RegisterMap;
+  typedef DenseMap<unsigned, std::string> RegisterNameMap;
+  typedef DenseMap<int, std::string> FrameMap;
+
+  RegisterMap UsedRegs;
+  RegisterNameMap RegNames;
+  FrameMap FrameSymbols;
+
+  PTXParamManager ParamManager;
 
 public:
+  typedef DenseSet<unsigned>::const_iterator reg_iterator;
+
   PTXMachineFunctionInfo(MachineFunction &MF)
-    : is_kernel(false), reg_ret(PTX::NoRegister), _isDoneAddArg(false) {
-      reg_arg.reserve(8);
-      reg_local_var.reserve(32);
+    : IsKernel(false) {
+      UsedRegs[PTX::RegPredRegisterClass] = RegisterList();
+      UsedRegs[PTX::RegI16RegisterClass] = RegisterList();
+      UsedRegs[PTX::RegI32RegisterClass] = RegisterList();
+      UsedRegs[PTX::RegI64RegisterClass] = RegisterList();
+      UsedRegs[PTX::RegF32RegisterClass] = RegisterList();
+      UsedRegs[PTX::RegF64RegisterClass] = RegisterList();
     }
 
-  void setKernel(bool _is_kernel=true) { is_kernel = _is_kernel; }
-
-  void addArgReg(unsigned reg) { reg_arg.push_back(reg); }
-  void addLocalVarReg(unsigned reg) { reg_local_var.push_back(reg); }
-  void addRetReg(unsigned reg) {
-    if (!isRetReg(reg)) {
-      reg_ret.push_back(reg);
+  /// getParamManager - Returns the PTXParamManager instance for this function.
+  PTXParamManager& getParamManager() { return ParamManager; }
+  const PTXParamManager& getParamManager() const { return ParamManager; }
+
+  /// setKernel/isKernel - Gets/sets a flag that indicates if this function is
+  /// a PTX kernel function.
+  void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; }
+  bool isKernel() const { return IsKernel; }
+
+  /// argreg_begin/argreg_end - Returns iterators to the set of registers
+  /// containing function arguments.
+  reg_iterator argreg_begin() const { return RegArgs.begin(); }
+  reg_iterator argreg_end()   const { return RegArgs.end(); }
+
+  /// retreg_begin/retreg_end - Returns iterators to the set of registers
+  /// containing the function return values.
+  reg_iterator retreg_begin() const { return RegRets.begin(); }
+  reg_iterator retreg_end()   const { return RegRets.end(); }
+
+  /// addRetReg - Adds a register to the set of return-value registers.
+  void addRetReg(unsigned Reg) {
+    if (!RegRets.count(Reg)) {
+      RegRets.insert(Reg);
+      std::string name;
+      name = "%ret";
+      name += utostr(RegRets.size() - 1);
+      RegNames[Reg] = name;
     }
   }
 
-  void doneAddArg(void) {
-    _isDoneAddArg = true;
+  /// addArgReg - Adds a register to the set of function argument registers.
+  void addArgReg(unsigned Reg) {
+    RegArgs.insert(Reg);
+    std::string name;
+    name = "%param";
+    name += utostr(RegArgs.size() - 1);
+    RegNames[Reg] = name;
   }
-  void doneAddLocalVar(void) {}
-
-  bool isKernel() const { return is_kernel; }
 
-  typedef std::vector<unsigned>::const_iterator         reg_iterator;
-  typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator;
-  typedef std::vector<unsigned>::const_iterator         ret_iterator;
-
-  bool         argRegEmpty() const { return reg_arg.empty(); }
-  int          getNumArg() const { return reg_arg.size(); }
-  reg_iterator argRegBegin() const { return reg_arg.begin(); }
-  reg_iterator argRegEnd()   const { return reg_arg.end(); }
-  reg_reverse_iterator argRegReverseBegin() const { return reg_arg.rbegin(); }
-  reg_reverse_iterator argRegReverseEnd() const { return reg_arg.rend(); }
-
-  bool         localVarRegEmpty() const { return reg_local_var.empty(); }
-  reg_iterator localVarRegBegin() const { return reg_local_var.begin(); }
-  reg_iterator localVarRegEnd()   const { return reg_local_var.end(); }
-
-  bool         retRegEmpty() const { return reg_ret.empty(); }
-  int          getNumRet() const { return reg_ret.size(); }
-  ret_iterator retRegBegin() const { return reg_ret.begin(); }
-  ret_iterator retRegEnd()   const { return reg_ret.end(); }
+  /// addVirtualRegister - Adds a virtual register to the set of all used
+  /// registers in the function.
+  void addVirtualRegister(const TargetRegisterClass *TRC, unsigned Reg) {
+    std::string name;
+
+    // Do not count registers that are argument/return registers.
+    if (!RegRets.count(Reg) && !RegArgs.count(Reg)) {
+      UsedRegs[TRC].push_back(Reg);
+      if (TRC == PTX::RegPredRegisterClass)
+        name = "%p";
+      else if (TRC == PTX::RegI16RegisterClass)
+        name = "%rh";
+      else if (TRC == PTX::RegI32RegisterClass)
+        name = "%r";
+      else if (TRC == PTX::RegI64RegisterClass)
+        name = "%rd";
+      else if (TRC == PTX::RegF32RegisterClass)
+        name = "%f";
+      else if (TRC == PTX::RegF64RegisterClass)
+        name = "%fd";
+      else
+        llvm_unreachable("Invalid register class");
+
+      name += utostr(UsedRegs[TRC].size() - 1);
+      RegNames[Reg] = name;
+    }
+  }
 
-  bool isArgReg(unsigned reg) const {
-    return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end();
+  /// getRegisterName - Returns the name of the specified virtual register. This
+  /// name is used during PTX emission.
+  const char *getRegisterName(unsigned Reg) const {
+    if (RegNames.count(Reg))
+      return RegNames.find(Reg)->second.c_str();
+    else if (Reg == PTX::NoRegister)
+      return "%noreg";
+    else
+      llvm_unreachable("Register not in register name map");
   }
 
-  bool isRetReg(unsigned reg) const {
-    return std::find(reg_ret.begin(), reg_ret.end(), reg) != reg_ret.end();
+  /// getNumRegistersForClass - Returns the number of virtual registers that are
+  /// used for the specified register class.
+  unsigned getNumRegistersForClass(const TargetRegisterClass *TRC) const {
+    return UsedRegs.lookup(TRC).size();
   }
 
-  bool isLocalVarReg(unsigned reg) const {
-    return std::find(reg_local_var.begin(), reg_local_var.end(), reg)
-      != reg_local_var.end();
+  /// getFrameSymbol - Returns the symbol name for the given FrameIndex.
+  const char* getFrameSymbol(int FrameIndex) {
+    if (FrameSymbols.count(FrameIndex)) {
+      return FrameSymbols.lookup(FrameIndex).c_str();
+    } else {
+      std::string Name = "__local";
+      Name += utostr(FrameIndex);
+      // The whole point of caching this name is to ensure the pointer we pass
+      // to any getExternalSymbol() calls will remain valid for the lifetime of
+      // the back-end instance. This is to work around an issue in SelectionDAG
+      // where symbol names are expected to be life-long strings.
+      FrameSymbols[FrameIndex] = Name;
+      return FrameSymbols[FrameIndex].c_str();
+    }
   }
 }; // class PTXMachineFunctionInfo
 } // namespace llvm
diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp
new file mode 100644
index 0000000000000..7753787ebc519
--- /dev/null
+++ b/lib/Target/PTX/PTXParamManager.cpp
@@ -0,0 +1,73 @@
+//===- PTXParamManager.cpp - Manager for .param variables -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXParamManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXParamManager.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+PTXParamManager::PTXParamManager() {
+}
+
+unsigned PTXParamManager::addArgumentParam(unsigned Size) {
+  PTXParam Param;
+  Param.Type = PTX_PARAM_TYPE_ARGUMENT;
+  Param.Size = Size;
+
+  std::string Name;
+  Name = "__param_";
+  Name += utostr(ArgumentParams.size()+1);
+  Param.Name = Name;
+
+  unsigned Index = AllParams.size();
+  AllParams[Index] = Param;
+  ArgumentParams.push_back(Index);
+
+  return Index;
+}
+
+unsigned PTXParamManager::addReturnParam(unsigned Size) {
+  PTXParam Param;
+  Param.Type = PTX_PARAM_TYPE_RETURN;
+  Param.Size = Size;
+
+  std::string Name;
+  Name = "__ret_";
+  Name += utostr(ReturnParams.size()+1);
+  Param.Name = Name;
+
+  unsigned Index = AllParams.size();
+  AllParams[Index] = Param;
+  ReturnParams.push_back(Index);
+
+  return Index;
+}
+
+unsigned PTXParamManager::addLocalParam(unsigned Size) {
+  PTXParam Param;
+  Param.Type = PTX_PARAM_TYPE_LOCAL;
+  Param.Size = Size;
+
+  std::string Name;
+  Name = "__localparam_";
+  Name += utostr(LocalParams.size()+1);
+  Param.Name = Name;
+
+  unsigned Index = AllParams.size();
+  AllParams[Index] = Param;
+  LocalParams.push_back(Index);
+
+  return Index;
+}
+
diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h
new file mode 100644
index 0000000000000..9fd2de52f7f24
--- /dev/null
+++ b/lib/Target/PTX/PTXParamManager.h
@@ -0,0 +1,86 @@
+//===- PTXParamManager.h - Manager for .param variables ----------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTXParamManager class, which manages all defined .param
+// variables for a particular function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_PARAM_MANAGER_H
+#define PTX_PARAM_MANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+/// PTXParamManager - This class manages all .param variables defined for a
+/// particular function.
+class PTXParamManager {
+private:
+
+  /// PTXParamType - Type of a .param variable
+  enum PTXParamType {
+    PTX_PARAM_TYPE_ARGUMENT,
+    PTX_PARAM_TYPE_RETURN,
+    PTX_PARAM_TYPE_LOCAL
+  };
+
+  /// PTXParam - Definition of a PTX .param variable
+  struct PTXParam {
+    PTXParamType  Type;
+    unsigned      Size;
+    std::string   Name;
+  };
+
+  DenseMap<unsigned, PTXParam> AllParams;
+  SmallVector<unsigned, 4> ArgumentParams;
+  SmallVector<unsigned, 4> ReturnParams;
+  SmallVector<unsigned, 4> LocalParams;
+
+public:
+
+  typedef SmallVector<unsigned, 4>::const_iterator param_iterator;
+
+  PTXParamManager();
+
+  param_iterator arg_begin() const { return ArgumentParams.begin(); }
+  param_iterator arg_end() const { return ArgumentParams.end(); }
+  param_iterator ret_begin() const { return ReturnParams.begin(); }
+  param_iterator ret_end() const { return ReturnParams.end(); }
+  param_iterator local_begin() const { return LocalParams.begin(); }
+  param_iterator local_end() const { return LocalParams.end(); }
+
+  /// addArgumentParam - Returns a new .param used as an argument.
+  unsigned addArgumentParam(unsigned Size);
+
+  /// addReturnParam - Returns a new .param used as a return argument.
+  unsigned addReturnParam(unsigned Size);
+
+  /// addLocalParam - Returns a new .param used as a local .param variable.
+  unsigned addLocalParam(unsigned Size);
+
+  /// getParamName - Returns the name of the parameter as a string.
+  const std::string &getParamName(unsigned Param) const {
+    assert(AllParams.count(Param) == 1 && "Param has not been defined!");
+    return AllParams.find(Param)->second.Name;
+  }
+
+  /// getParamSize - Returns the size of the parameter in bits.
+  unsigned getParamSize(unsigned Param) const {
+    assert(AllParams.count(Param) == 1 && "Param has not been defined!");
+    return AllParams.find(Param)->second.Size;
+  }
+
+};
+
+}
+
+#endif
+
diff --git a/lib/Target/PTX/PTXRegAlloc.cpp b/lib/Target/PTX/PTXRegAlloc.cpp
new file mode 100644
index 0000000000000..2d2d5c30c8ca9
--- /dev/null
+++ b/lib/Target/PTX/PTXRegAlloc.cpp
@@ -0,0 +1,58 @@
+//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a register allocator for PTX code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-reg-alloc"
+
+#include "PTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+
+using namespace llvm;
+
+namespace {
+  // Special register allocator for PTX.
+  class PTXRegAlloc : public MachineFunctionPass {
+  public:
+    static char ID;
+    PTXRegAlloc() : MachineFunctionPass(ID) {
+      initializePHIEliminationPass(*PassRegistry::getPassRegistry());
+      initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual const char* getPassName() const {
+      return "PTX Register Allocator";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(PHIEliminationID);
+      AU.addRequiredID(TwoAddressInstructionPassID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      // We do not actually do anything (at least not yet).
+      return false;
+    }
+  };
+
+  char PTXRegAlloc::ID = 0;
+
+  static RegisterRegAlloc
+    ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator);
+}
+
+FunctionPass *llvm::createPTXRegisterAllocator() {
+  return new PTXRegAlloc();
+}
+
diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp
index cb56ea98a2b8d..c8062664a93b8 100644
--- a/lib/Target/PTX/PTXRegisterInfo.cpp
+++ b/lib/Target/PTX/PTXRegisterInfo.cpp
@@ -14,6 +14,9 @@
 #include "PTX.h"
 #include "PTXRegisterInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -23,15 +26,23 @@
 using namespace llvm;
 
 PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine &TM,
-                                 const TargetInstrInfo &TII)
-  : PTXGenRegisterInfo() {
+                                 const TargetInstrInfo &tii)
+  // PTX does not have a return address register.
+  : PTXGenRegisterInfo(0), TII(tii) {
 }
 
 void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                           int SPAdj,
                                           RegScavenger *RS) const {
   unsigned Index;
-  MachineInstr& MI = *II;
+  MachineInstr &MI = *II;
+  //MachineBasicBlock &MBB = *MI.getParent();
+  //DebugLoc dl = MI.getDebugLoc();
+  //MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  //unsigned Reg = MRI.createVirtualRegister(PTX::RegF32RegisterClass);
+
+  llvm_unreachable("FrameIndex should have been previously eliminated!");
 
   Index = 0;
   while (!MI.getOperand(Index).isFI()) {
@@ -46,6 +57,18 @@ void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   DEBUG(dbgs() << "- SPAdj: " << SPAdj << "\n");
   DEBUG(dbgs() << "- FrameIndex: " << FrameIndex << "\n");
 
+  //MachineInstr* MI2 = BuildMI(MBB, II, dl, TII.get(PTX::LOAD_LOCAL_F32))
+  //.addReg(Reg, RegState::Define).addImm(FrameIndex);
+  //if (MI2->findFirstPredOperandIdx() == -1) {
+  //  MI2->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
+  //  MI2->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+  //}
+  //MI2->dump();
+
+  //MachineOperand ESOp = MachineOperand::CreateES("__local__");
+
   // This frame index is post stack slot re-use assignments
+  //MI.getOperand(Index).ChangeToRegister(Reg, false);
   MI.getOperand(Index).ChangeToImmediate(FrameIndex);
+  //MI.getOperand(Index) = ESOp;
 }
diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h
index 0b63cb6d458e6..55fafe47bf353 100644
--- a/lib/Target/PTX/PTXRegisterInfo.h
+++ b/lib/Target/PTX/PTXRegisterInfo.h
@@ -25,8 +25,12 @@ class PTXTargetMachine;
 class MachineFunction;
 
 struct PTXRegisterInfo : public PTXGenRegisterInfo {
+private:
+  const TargetInstrInfo &TII;
+
+public:
   PTXRegisterInfo(PTXTargetMachine &TM,
-                  const TargetInstrInfo &TII);
+                  const TargetInstrInfo &tii);
 
   virtual const unsigned
     *getCalleeSavedRegs(const MachineFunction *MF = 0) const {
@@ -47,18 +51,6 @@ struct PTXRegisterInfo : public PTXGenRegisterInfo {
     llvm_unreachable("PTX does not have a frame register");
     return 0;
   }
-
-  virtual unsigned getRARegister() const {
-    llvm_unreachable("PTX does not have a return address register");
-    return 0;
-  }
-
-  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const {
-    return PTXGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-  }
-  virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const {
-    return PTXGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
-  }
 }; // struct PTXRegisterInfo
 } // namespace llvm
 
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
index 1313d248325e3..6ed6d3fe385f9 100644
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ b/lib/Target/PTX/PTXRegisterInfo.td
@@ -20,536 +20,18 @@ class PTXReg<string n> : Register<n> {
 //  Registers
 //===----------------------------------------------------------------------===//
 
-///===- Predicate Registers -----------------------------------------------===//
-
-def P0 : PTXReg<"p0">;
-def P1 : PTXReg<"p1">;
-def P2 : PTXReg<"p2">;
-def P3 : PTXReg<"p3">;
-def P4 : PTXReg<"p4">;
-def P5 : PTXReg<"p5">;
-def P6 : PTXReg<"p6">;
-def P7 : PTXReg<"p7">;
-def P8 : PTXReg<"p8">;
-def P9 : PTXReg<"p9">;
-def P10 : PTXReg<"p10">;
-def P11 : PTXReg<"p11">;
-def P12 : PTXReg<"p12">;
-def P13 : PTXReg<"p13">;
-def P14 : PTXReg<"p14">;
-def P15 : PTXReg<"p15">;
-def P16 : PTXReg<"p16">;
-def P17 : PTXReg<"p17">;
-def P18 : PTXReg<"p18">;
-def P19 : PTXReg<"p19">;
-def P20 : PTXReg<"p20">;
-def P21 : PTXReg<"p21">;
-def P22 : PTXReg<"p22">;
-def P23 : PTXReg<"p23">;
-def P24 : PTXReg<"p24">;
-def P25 : PTXReg<"p25">;
-def P26 : PTXReg<"p26">;
-def P27 : PTXReg<"p27">;
-def P28 : PTXReg<"p28">;
-def P29 : PTXReg<"p29">;
-def P30 : PTXReg<"p30">;
-def P31 : PTXReg<"p31">;
-def P32 : PTXReg<"p32">;
-def P33 : PTXReg<"p33">;
-def P34 : PTXReg<"p34">;
-def P35 : PTXReg<"p35">;
-def P36 : PTXReg<"p36">;
-def P37 : PTXReg<"p37">;
-def P38 : PTXReg<"p38">;
-def P39 : PTXReg<"p39">;
-def P40 : PTXReg<"p40">;
-def P41 : PTXReg<"p41">;
-def P42 : PTXReg<"p42">;
-def P43 : PTXReg<"p43">;
-def P44 : PTXReg<"p44">;
-def P45 : PTXReg<"p45">;
-def P46 : PTXReg<"p46">;
-def P47 : PTXReg<"p47">;
-def P48 : PTXReg<"p48">;
-def P49 : PTXReg<"p49">;
-def P50 : PTXReg<"p50">;
-def P51 : PTXReg<"p51">;
-def P52 : PTXReg<"p52">;
-def P53 : PTXReg<"p53">;
-def P54 : PTXReg<"p54">;
-def P55 : PTXReg<"p55">;
-def P56 : PTXReg<"p56">;
-def P57 : PTXReg<"p57">;
-def P58 : PTXReg<"p58">;
-def P59 : PTXReg<"p59">;
-def P60 : PTXReg<"p60">;
-def P61 : PTXReg<"p61">;
-def P62 : PTXReg<"p62">;
-def P63 : PTXReg<"p63">;
-def P64 : PTXReg<"p64">;
-def P65 : PTXReg<"p65">;
-def P66 : PTXReg<"p66">;
-def P67 : PTXReg<"p67">;
-def P68 : PTXReg<"p68">;
-def P69 : PTXReg<"p69">;
-def P70 : PTXReg<"p70">;
-def P71 : PTXReg<"p71">;
-def P72 : PTXReg<"p72">;
-def P73 : PTXReg<"p73">;
-def P74 : PTXReg<"p74">;
-def P75 : PTXReg<"p75">;
-def P76 : PTXReg<"p76">;
-def P77 : PTXReg<"p77">;
-def P78 : PTXReg<"p78">;
-def P79 : PTXReg<"p79">;
-def P80 : PTXReg<"p80">;
-def P81 : PTXReg<"p81">;
-def P82 : PTXReg<"p82">;
-def P83 : PTXReg<"p83">;
-def P84 : PTXReg<"p84">;
-def P85 : PTXReg<"p85">;
-def P86 : PTXReg<"p86">;
-def P87 : PTXReg<"p87">;
-def P88 : PTXReg<"p88">;
-def P89 : PTXReg<"p89">;
-def P90 : PTXReg<"p90">;
-def P91 : PTXReg<"p91">;
-def P92 : PTXReg<"p92">;
-def P93 : PTXReg<"p93">;
-def P94 : PTXReg<"p94">;
-def P95 : PTXReg<"p95">;
-def P96 : PTXReg<"p96">;
-def P97 : PTXReg<"p97">;
-def P98 : PTXReg<"p98">;
-def P99 : PTXReg<"p99">;
-def P100 : PTXReg<"p100">;
-def P101 : PTXReg<"p101">;
-def P102 : PTXReg<"p102">;
-def P103 : PTXReg<"p103">;
-def P104 : PTXReg<"p104">;
-def P105 : PTXReg<"p105">;
-def P106 : PTXReg<"p106">;
-def P107 : PTXReg<"p107">;
-def P108 : PTXReg<"p108">;
-def P109 : PTXReg<"p109">;
-def P110 : PTXReg<"p110">;
-def P111 : PTXReg<"p111">;
-def P112 : PTXReg<"p112">;
-def P113 : PTXReg<"p113">;
-def P114 : PTXReg<"p114">;
-def P115 : PTXReg<"p115">;
-def P116 : PTXReg<"p116">;
-def P117 : PTXReg<"p117">;
-def P118 : PTXReg<"p118">;
-def P119 : PTXReg<"p119">;
-def P120 : PTXReg<"p120">;
-def P121 : PTXReg<"p121">;
-def P122 : PTXReg<"p122">;
-def P123 : PTXReg<"p123">;
-def P124 : PTXReg<"p124">;
-def P125 : PTXReg<"p125">;
-def P126 : PTXReg<"p126">;
-def P127 : PTXReg<"p127">;
-
-///===- 16-Bit Registers --------------------------------------------------===//
-
-def RH0 : PTXReg<"rh0">;
-def RH1 : PTXReg<"rh1">;
-def RH2 : PTXReg<"rh2">;
-def RH3 : PTXReg<"rh3">;
-def RH4 : PTXReg<"rh4">;
-def RH5 : PTXReg<"rh5">;
-def RH6 : PTXReg<"rh6">;
-def RH7 : PTXReg<"rh7">;
-def RH8 : PTXReg<"rh8">;
-def RH9 : PTXReg<"rh9">;
-def RH10 : PTXReg<"rh10">;
-def RH11 : PTXReg<"rh11">;
-def RH12 : PTXReg<"rh12">;
-def RH13 : PTXReg<"rh13">;
-def RH14 : PTXReg<"rh14">;
-def RH15 : PTXReg<"rh15">;
-def RH16 : PTXReg<"rh16">;
-def RH17 : PTXReg<"rh17">;
-def RH18 : PTXReg<"rh18">;
-def RH19 : PTXReg<"rh19">;
-def RH20 : PTXReg<"rh20">;
-def RH21 : PTXReg<"rh21">;
-def RH22 : PTXReg<"rh22">;
-def RH23 : PTXReg<"rh23">;
-def RH24 : PTXReg<"rh24">;
-def RH25 : PTXReg<"rh25">;
-def RH26 : PTXReg<"rh26">;
-def RH27 : PTXReg<"rh27">;
-def RH28 : PTXReg<"rh28">;
-def RH29 : PTXReg<"rh29">;
-def RH30 : PTXReg<"rh30">;
-def RH31 : PTXReg<"rh31">;
-def RH32 : PTXReg<"rh32">;
-def RH33 : PTXReg<"rh33">;
-def RH34 : PTXReg<"rh34">;
-def RH35 : PTXReg<"rh35">;
-def RH36 : PTXReg<"rh36">;
-def RH37 : PTXReg<"rh37">;
-def RH38 : PTXReg<"rh38">;
-def RH39 : PTXReg<"rh39">;
-def RH40 : PTXReg<"rh40">;
-def RH41 : PTXReg<"rh41">;
-def RH42 : PTXReg<"rh42">;
-def RH43 : PTXReg<"rh43">;
-def RH44 : PTXReg<"rh44">;
-def RH45 : PTXReg<"rh45">;
-def RH46 : PTXReg<"rh46">;
-def RH47 : PTXReg<"rh47">;
-def RH48 : PTXReg<"rh48">;
-def RH49 : PTXReg<"rh49">;
-def RH50 : PTXReg<"rh50">;
-def RH51 : PTXReg<"rh51">;
-def RH52 : PTXReg<"rh52">;
-def RH53 : PTXReg<"rh53">;
-def RH54 : PTXReg<"rh54">;
-def RH55 : PTXReg<"rh55">;
-def RH56 : PTXReg<"rh56">;
-def RH57 : PTXReg<"rh57">;
-def RH58 : PTXReg<"rh58">;
-def RH59 : PTXReg<"rh59">;
-def RH60 : PTXReg<"rh60">;
-def RH61 : PTXReg<"rh61">;
-def RH62 : PTXReg<"rh62">;
-def RH63 : PTXReg<"rh63">;
-def RH64 : PTXReg<"rh64">;
-def RH65 : PTXReg<"rh65">;
-def RH66 : PTXReg<"rh66">;
-def RH67 : PTXReg<"rh67">;
-def RH68 : PTXReg<"rh68">;
-def RH69 : PTXReg<"rh69">;
-def RH70 : PTXReg<"rh70">;
-def RH71 : PTXReg<"rh71">;
-def RH72 : PTXReg<"rh72">;
-def RH73 : PTXReg<"rh73">;
-def RH74 : PTXReg<"rh74">;
-def RH75 : PTXReg<"rh75">;
-def RH76 : PTXReg<"rh76">;
-def RH77 : PTXReg<"rh77">;
-def RH78 : PTXReg<"rh78">;
-def RH79 : PTXReg<"rh79">;
-def RH80 : PTXReg<"rh80">;
-def RH81 : PTXReg<"rh81">;
-def RH82 : PTXReg<"rh82">;
-def RH83 : PTXReg<"rh83">;
-def RH84 : PTXReg<"rh84">;
-def RH85 : PTXReg<"rh85">;
-def RH86 : PTXReg<"rh86">;
-def RH87 : PTXReg<"rh87">;
-def RH88 : PTXReg<"rh88">;
-def RH89 : PTXReg<"rh89">;
-def RH90 : PTXReg<"rh90">;
-def RH91 : PTXReg<"rh91">;
-def RH92 : PTXReg<"rh92">;
-def RH93 : PTXReg<"rh93">;
-def RH94 : PTXReg<"rh94">;
-def RH95 : PTXReg<"rh95">;
-def RH96 : PTXReg<"rh96">;
-def RH97 : PTXReg<"rh97">;
-def RH98 : PTXReg<"rh98">;
-def RH99 : PTXReg<"rh99">;
-def RH100 : PTXReg<"rh100">;
-def RH101 : PTXReg<"rh101">;
-def RH102 : PTXReg<"rh102">;
-def RH103 : PTXReg<"rh103">;
-def RH104 : PTXReg<"rh104">;
-def RH105 : PTXReg<"rh105">;
-def RH106 : PTXReg<"rh106">;
-def RH107 : PTXReg<"rh107">;
-def RH108 : PTXReg<"rh108">;
-def RH109 : PTXReg<"rh109">;
-def RH110 : PTXReg<"rh110">;
-def RH111 : PTXReg<"rh111">;
-def RH112 : PTXReg<"rh112">;
-def RH113 : PTXReg<"rh113">;
-def RH114 : PTXReg<"rh114">;
-def RH115 : PTXReg<"rh115">;
-def RH116 : PTXReg<"rh116">;
-def RH117 : PTXReg<"rh117">;
-def RH118 : PTXReg<"rh118">;
-def RH119 : PTXReg<"rh119">;
-def RH120 : PTXReg<"rh120">;
-def RH121 : PTXReg<"rh121">;
-def RH122 : PTXReg<"rh122">;
-def RH123 : PTXReg<"rh123">;
-def RH124 : PTXReg<"rh124">;
-def RH125 : PTXReg<"rh125">;
-def RH126 : PTXReg<"rh126">;
-def RH127 : PTXReg<"rh127">;
-
-///===- 32-Bit Registers --------------------------------------------------===//
-
-def R0 : PTXReg<"r0">;
-def R1 : PTXReg<"r1">;
-def R2 : PTXReg<"r2">;
-def R3 : PTXReg<"r3">;
-def R4 : PTXReg<"r4">;
-def R5 : PTXReg<"r5">;
-def R6 : PTXReg<"r6">;
-def R7 : PTXReg<"r7">;
-def R8 : PTXReg<"r8">;
-def R9 : PTXReg<"r9">;
-def R10 : PTXReg<"r10">;
-def R11 : PTXReg<"r11">;
-def R12 : PTXReg<"r12">;
-def R13 : PTXReg<"r13">;
-def R14 : PTXReg<"r14">;
-def R15 : PTXReg<"r15">;
-def R16 : PTXReg<"r16">;
-def R17 : PTXReg<"r17">;
-def R18 : PTXReg<"r18">;
-def R19 : PTXReg<"r19">;
-def R20 : PTXReg<"r20">;
-def R21 : PTXReg<"r21">;
-def R22 : PTXReg<"r22">;
-def R23 : PTXReg<"r23">;
-def R24 : PTXReg<"r24">;
-def R25 : PTXReg<"r25">;
-def R26 : PTXReg<"r26">;
-def R27 : PTXReg<"r27">;
-def R28 : PTXReg<"r28">;
-def R29 : PTXReg<"r29">;
-def R30 : PTXReg<"r30">;
-def R31 : PTXReg<"r31">;
-def R32 : PTXReg<"r32">;
-def R33 : PTXReg<"r33">;
-def R34 : PTXReg<"r34">;
-def R35 : PTXReg<"r35">;
-def R36 : PTXReg<"r36">;
-def R37 : PTXReg<"r37">;
-def R38 : PTXReg<"r38">;
-def R39 : PTXReg<"r39">;
-def R40 : PTXReg<"r40">;
-def R41 : PTXReg<"r41">;
-def R42 : PTXReg<"r42">;
-def R43 : PTXReg<"r43">;
-def R44 : PTXReg<"r44">;
-def R45 : PTXReg<"r45">;
-def R46 : PTXReg<"r46">;
-def R47 : PTXReg<"r47">;
-def R48 : PTXReg<"r48">;
-def R49 : PTXReg<"r49">;
-def R50 : PTXReg<"r50">;
-def R51 : PTXReg<"r51">;
-def R52 : PTXReg<"r52">;
-def R53 : PTXReg<"r53">;
-def R54 : PTXReg<"r54">;
-def R55 : PTXReg<"r55">;
-def R56 : PTXReg<"r56">;
-def R57 : PTXReg<"r57">;
-def R58 : PTXReg<"r58">;
-def R59 : PTXReg<"r59">;
-def R60 : PTXReg<"r60">;
-def R61 : PTXReg<"r61">;
-def R62 : PTXReg<"r62">;
-def R63 : PTXReg<"r63">;
-def R64 : PTXReg<"r64">;
-def R65 : PTXReg<"r65">;
-def R66 : PTXReg<"r66">;
-def R67 : PTXReg<"r67">;
-def R68 : PTXReg<"r68">;
-def R69 : PTXReg<"r69">;
-def R70 : PTXReg<"r70">;
-def R71 : PTXReg<"r71">;
-def R72 : PTXReg<"r72">;
-def R73 : PTXReg<"r73">;
-def R74 : PTXReg<"r74">;
-def R75 : PTXReg<"r75">;
-def R76 : PTXReg<"r76">;
-def R77 : PTXReg<"r77">;
-def R78 : PTXReg<"r78">;
-def R79 : PTXReg<"r79">;
-def R80 : PTXReg<"r80">;
-def R81 : PTXReg<"r81">;
-def R82 : PTXReg<"r82">;
-def R83 : PTXReg<"r83">;
-def R84 : PTXReg<"r84">;
-def R85 : PTXReg<"r85">;
-def R86 : PTXReg<"r86">;
-def R87 : PTXReg<"r87">;
-def R88 : PTXReg<"r88">;
-def R89 : PTXReg<"r89">;
-def R90 : PTXReg<"r90">;
-def R91 : PTXReg<"r91">;
-def R92 : PTXReg<"r92">;
-def R93 : PTXReg<"r93">;
-def R94 : PTXReg<"r94">;
-def R95 : PTXReg<"r95">;
-def R96 : PTXReg<"r96">;
-def R97 : PTXReg<"r97">;
-def R98 : PTXReg<"r98">;
-def R99 : PTXReg<"r99">;
-def R100 : PTXReg<"r100">;
-def R101 : PTXReg<"r101">;
-def R102 : PTXReg<"r102">;
-def R103 : PTXReg<"r103">;
-def R104 : PTXReg<"r104">;
-def R105 : PTXReg<"r105">;
-def R106 : PTXReg<"r106">;
-def R107 : PTXReg<"r107">;
-def R108 : PTXReg<"r108">;
-def R109 : PTXReg<"r109">;
-def R110 : PTXReg<"r110">;
-def R111 : PTXReg<"r111">;
-def R112 : PTXReg<"r112">;
-def R113 : PTXReg<"r113">;
-def R114 : PTXReg<"r114">;
-def R115 : PTXReg<"r115">;
-def R116 : PTXReg<"r116">;
-def R117 : PTXReg<"r117">;
-def R118 : PTXReg<"r118">;
-def R119 : PTXReg<"r119">;
-def R120 : PTXReg<"r120">;
-def R121 : PTXReg<"r121">;
-def R122 : PTXReg<"r122">;
-def R123 : PTXReg<"r123">;
-def R124 : PTXReg<"r124">;
-def R125 : PTXReg<"r125">;
-def R126 : PTXReg<"r126">;
-def R127 : PTXReg<"r127">;
-
-///===- 64-Bit Registers --------------------------------------------------===//
-
-def RD0 : PTXReg<"rd0">;
-def RD1 : PTXReg<"rd1">;
-def RD2 : PTXReg<"rd2">;
-def RD3 : PTXReg<"rd3">;
-def RD4 : PTXReg<"rd4">;
-def RD5 : PTXReg<"rd5">;
-def RD6 : PTXReg<"rd6">;
-def RD7 : PTXReg<"rd7">;
-def RD8 : PTXReg<"rd8">;
-def RD9 : PTXReg<"rd9">;
-def RD10 : PTXReg<"rd10">;
-def RD11 : PTXReg<"rd11">;
-def RD12 : PTXReg<"rd12">;
-def RD13 : PTXReg<"rd13">;
-def RD14 : PTXReg<"rd14">;
-def RD15 : PTXReg<"rd15">;
-def RD16 : PTXReg<"rd16">;
-def RD17 : PTXReg<"rd17">;
-def RD18 : PTXReg<"rd18">;
-def RD19 : PTXReg<"rd19">;
-def RD20 : PTXReg<"rd20">;
-def RD21 : PTXReg<"rd21">;
-def RD22 : PTXReg<"rd22">;
-def RD23 : PTXReg<"rd23">;
-def RD24 : PTXReg<"rd24">;
-def RD25 : PTXReg<"rd25">;
-def RD26 : PTXReg<"rd26">;
-def RD27 : PTXReg<"rd27">;
-def RD28 : PTXReg<"rd28">;
-def RD29 : PTXReg<"rd29">;
-def RD30 : PTXReg<"rd30">;
-def RD31 : PTXReg<"rd31">;
-def RD32 : PTXReg<"rd32">;
-def RD33 : PTXReg<"rd33">;
-def RD34 : PTXReg<"rd34">;
-def RD35 : PTXReg<"rd35">;
-def RD36 : PTXReg<"rd36">;
-def RD37 : PTXReg<"rd37">;
-def RD38 : PTXReg<"rd38">;
-def RD39 : PTXReg<"rd39">;
-def RD40 : PTXReg<"rd40">;
-def RD41 : PTXReg<"rd41">;
-def RD42 : PTXReg<"rd42">;
-def RD43 : PTXReg<"rd43">;
-def RD44 : PTXReg<"rd44">;
-def RD45 : PTXReg<"rd45">;
-def RD46 : PTXReg<"rd46">;
-def RD47 : PTXReg<"rd47">;
-def RD48 : PTXReg<"rd48">;
-def RD49 : PTXReg<"rd49">;
-def RD50 : PTXReg<"rd50">;
-def RD51 : PTXReg<"rd51">;
-def RD52 : PTXReg<"rd52">;
-def RD53 : PTXReg<"rd53">;
-def RD54 : PTXReg<"rd54">;
-def RD55 : PTXReg<"rd55">;
-def RD56 : PTXReg<"rd56">;
-def RD57 : PTXReg<"rd57">;
-def RD58 : PTXReg<"rd58">;
-def RD59 : PTXReg<"rd59">;
-def RD60 : PTXReg<"rd60">;
-def RD61 : PTXReg<"rd61">;
-def RD62 : PTXReg<"rd62">;
-def RD63 : PTXReg<"rd63">;
-def RD64 : PTXReg<"rd64">;
-def RD65 : PTXReg<"rd65">;
-def RD66 : PTXReg<"rd66">;
-def RD67 : PTXReg<"rd67">;
-def RD68 : PTXReg<"rd68">;
-def RD69 : PTXReg<"rd69">;
-def RD70 : PTXReg<"rd70">;
-def RD71 : PTXReg<"rd71">;
-def RD72 : PTXReg<"rd72">;
-def RD73 : PTXReg<"rd73">;
-def RD74 : PTXReg<"rd74">;
-def RD75 : PTXReg<"rd75">;
-def RD76 : PTXReg<"rd76">;
-def RD77 : PTXReg<"rd77">;
-def RD78 : PTXReg<"rd78">;
-def RD79 : PTXReg<"rd79">;
-def RD80 : PTXReg<"rd80">;
-def RD81 : PTXReg<"rd81">;
-def RD82 : PTXReg<"rd82">;
-def RD83 : PTXReg<"rd83">;
-def RD84 : PTXReg<"rd84">;
-def RD85 : PTXReg<"rd85">;
-def RD86 : PTXReg<"rd86">;
-def RD87 : PTXReg<"rd87">;
-def RD88 : PTXReg<"rd88">;
-def RD89 : PTXReg<"rd89">;
-def RD90 : PTXReg<"rd90">;
-def RD91 : PTXReg<"rd91">;
-def RD92 : PTXReg<"rd92">;
-def RD93 : PTXReg<"rd93">;
-def RD94 : PTXReg<"rd94">;
-def RD95 : PTXReg<"rd95">;
-def RD96 : PTXReg<"rd96">;
-def RD97 : PTXReg<"rd97">;
-def RD98 : PTXReg<"rd98">;
-def RD99 : PTXReg<"rd99">;
-def RD100 : PTXReg<"rd100">;
-def RD101 : PTXReg<"rd101">;
-def RD102 : PTXReg<"rd102">;
-def RD103 : PTXReg<"rd103">;
-def RD104 : PTXReg<"rd104">;
-def RD105 : PTXReg<"rd105">;
-def RD106 : PTXReg<"rd106">;
-def RD107 : PTXReg<"rd107">;
-def RD108 : PTXReg<"rd108">;
-def RD109 : PTXReg<"rd109">;
-def RD110 : PTXReg<"rd110">;
-def RD111 : PTXReg<"rd111">;
-def RD112 : PTXReg<"rd112">;
-def RD113 : PTXReg<"rd113">;
-def RD114 : PTXReg<"rd114">;
-def RD115 : PTXReg<"rd115">;
-def RD116 : PTXReg<"rd116">;
-def RD117 : PTXReg<"rd117">;
-def RD118 : PTXReg<"rd118">;
-def RD119 : PTXReg<"rd119">;
-def RD120 : PTXReg<"rd120">;
-def RD121 : PTXReg<"rd121">;
-def RD122 : PTXReg<"rd122">;
-def RD123 : PTXReg<"rd123">;
-def RD124 : PTXReg<"rd124">;
-def RD125 : PTXReg<"rd125">;
-def RD126 : PTXReg<"rd126">;
-def RD127 : PTXReg<"rd127">;
+// The generated register info code throws warnings for empty register classes
+// (e.g. zero-length arrays), so we use a dummy register here just to prevent
+// these warnings.
+def DUMMY_REG : PTXReg<"R0">;
 
 //===----------------------------------------------------------------------===//
 //  Register classes
 //===----------------------------------------------------------------------===//
-def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%u", 0, 127)>;
-def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%u", 0, 127)>;
-def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%u", 0, 127)>;
-def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%u", 0, 127)>;
-def RegF32 : RegisterClass<"PTX", [f32], 32, (sequence "R%u", 0, 127)>;
-def RegF64 : RegisterClass<"PTX", [f64], 64, (sequence "RD%u", 0, 127)>;
+def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>;
+def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>;
+def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>;
+def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>;
+def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>;
+def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>;
+
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/lib/Target/PTX/PTXSelectionDAGInfo.cpp
new file mode 100644
index 0000000000000..50ef14a13d949
--- /dev/null
+++ b/lib/Target/PTX/PTXSelectionDAGInfo.cpp
@@ -0,0 +1,149 @@
+//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-selectiondag-info"
+#include "PTXTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM),
+    Subtarget(&TM.getSubtarget<PTXSubtarget>()) {
+}
+
+PTXSelectionDAGInfo::~PTXSelectionDAGInfo() {
+}
+
+SDValue
+PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile, bool AlwaysInline,
+                                             MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
+  // Do repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  // Always inline memcpys. In PTX, we do not have a C library that provides
+  // a memcpy function.
+  //if (!AlwaysInline)
+  //  return SDValue();
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  const unsigned MAX_LOADS_IN_LDM = 6;
+  SDValue TFOps[MAX_LOADS_IN_LDM];
+  SDValue Loads[MAX_LOADS_IN_LDM];
+  uint64_t SrcOff = 0, DstOff = 0;
+  EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      Loads[i] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, PointerType, Src,
+                                         DAG.getConstant(SrcOff, PointerType)),
+                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+                             false, 0);
+      TFOps[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                              DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+                                          DAG.getConstant(DstOff, PointerType)),
+                              DstPtrInfo.getWithOffset(DstOff),
+                              isVolatile, false, 0);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    EmittedNumMemOps += i;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, PointerType, Src,
+                                       DAG.getConstant(SrcOff, PointerType)),
+                           SrcPtrInfo.getWithOffset(SrcOff), false, false, 0);
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+                                        DAG.getConstant(DstOff, PointerType)),
+                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+SDValue PTXSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                        SDValue Chain, SDValue Dst,
+                        SDValue Src, SDValue Size,
+                        unsigned Align, bool isVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
+  llvm_unreachable("memset lowering not implemented for PTX yet");
+}
+
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.h b/lib/Target/PTX/PTXSelectionDAGInfo.h
new file mode 100644
index 0000000000000..e0c716718f084
--- /dev/null
+++ b/lib/Target/PTX/PTXSelectionDAGInfo.h
@@ -0,0 +1,53 @@
+//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTX subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXSELECTIONDAGINFO_H
+#define PTXSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target.
+/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo.
+class PTXSelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const PTXSubtarget *Subtarget;
+
+public:
+  explicit PTXSelectionDAGInfo(const TargetMachine &TM);
+  ~PTXSelectionDAGInfo();
+
+  virtual
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const;
+
+  virtual
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const;
+};
+
+}
+
+#endif
+
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
index 8ec646e46f68c..1eb57d2f17021 100644
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ b/lib/Target/PTX/PTXSubtarget.cpp
@@ -14,7 +14,7 @@
 #include "PTXSubtarget.h"
 #include "PTX.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
index 0921f1f22c49b..b946d7c11cefd 100644
--- a/lib/Target/PTX/PTXSubtarget.h
+++ b/lib/Target/PTX/PTXSubtarget.h
@@ -114,7 +114,16 @@ class StringRef;
                (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
       }
 
-    void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+      bool callsAreHandled() const {
+        return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) ||
+               (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
+      }
+
+      bool emitPtrAttribute() const {
+        return PTXVersion >= PTX_VERSION_2_2;
+      }
+
+      void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
   }; // class PTXSubtarget
 } // namespace llvm
 
diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp
index ab926e02d66f9..449a3d9fc8d44 100644
--- a/lib/Target/PTX/PTXTargetMachine.cpp
+++ b/lib/Target/PTX/PTXTargetMachine.cpp
@@ -14,8 +14,32 @@
 #include "PTX.h"
 #include "PTXTargetMachine.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
 
 using namespace llvm;
 
@@ -25,7 +49,7 @@ namespace llvm {
                                    bool useCFI,
                                    MCInstPrinter *InstPrint,
                                    MCCodeEmitter *CE,
-                                   TargetAsmBackend *TAB,
+                                   MCAsmBackend *MAB,
                                    bool ShowInst);
 }
 
@@ -43,34 +67,47 @@ namespace {
     "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
   const char* DataLayout64 =
     "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
+
+  // Copied from LLVMTargetMachine.cpp
+  void printNoVerify(PassManagerBase &PM, const char *Banner) {
+    if (PrintMachineCode)
+      PM.add(createMachineFunctionPrinterPass(dbgs(), Banner));
+  }
+
+  void printAndVerify(PassManagerBase &PM,
+                      const char *Banner) {
+    if (PrintMachineCode)
+      PM.add(createMachineFunctionPrinterPass(dbgs(), Banner));
+
+    //if (VerifyMachineCode)
+    //  PM.add(createMachineVerifierPass(Banner));
+  }
 }
 
 // DataLayout and FrameLowering are filled with dummy data
 PTXTargetMachine::PTXTargetMachine(const Target &T,
-                                   const std::string &TT,
-                                   const std::string &CPU,
-                                   const std::string &FS,
+                                   StringRef TT, StringRef CPU, StringRef FS,
+                                   Reloc::Model RM, CodeModel::Model CM,
                                    bool is64Bit)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     DataLayout(is64Bit ? DataLayout64 : DataLayout32),
     Subtarget(TT, CPU, FS, is64Bit),
     FrameLowering(Subtarget),
     InstrInfo(*this),
+    TSInfo(*this),
     TLInfo(*this) {
 }
 
-PTX32TargetMachine::PTX32TargetMachine(const Target &T,
-                                       const std::string& TT,
-                                       const std::string& CPU,
-                                       const std::string& FS)
-  : PTXTargetMachine(T, TT, CPU, FS, false) {
+PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : PTXTargetMachine(T, TT, CPU, FS, RM, CM, false) {
 }
 
-PTX64TargetMachine::PTX64TargetMachine(const Target &T,
-                                       const std::string& TT,
-                                       const std::string& CPU,
-                                       const std::string& FS)
-  : PTXTargetMachine(T, TT, CPU, FS, true) {
+PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : PTXTargetMachine(T, TT, CPU, FS, RM, CM, true) {
 }
 
 bool PTXTargetMachine::addInstSelector(PassManagerBase &PM,
@@ -82,6 +119,255 @@ bool PTXTargetMachine::addInstSelector(PassManagerBase &PM,
 bool PTXTargetMachine::addPostRegAlloc(PassManagerBase &PM,
                                        CodeGenOpt::Level OptLevel) {
   // PTXMFInfoExtract must after register allocation!
+  //PM.add(createPTXMFInfoExtract(*this, OptLevel));
+  return false;
+}
+
+bool PTXTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                           formatted_raw_ostream &Out,
+                                           CodeGenFileType FileType,
+                                           CodeGenOpt::Level OptLevel,
+                                           bool DisableVerify) {
+  // This is mostly based on LLVMTargetMachine::addPassesToEmitFile
+
+  // Add common CodeGen passes.
+  MCContext *Context = 0;
+  if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Context))
+    return true;
+  assert(Context != 0 && "Failed to get MCContext");
+
+  if (hasMCSaveTempLabels())
+    Context->setAllowTemporaryLabels(false);
+
+  const MCAsmInfo &MAI = *getMCAsmInfo();
+  const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+  OwningPtr<MCStreamer> AsmStreamer;
+
+  switch (FileType) {
+  default: return true;
+  case CGFT_AssemblyFile: {
+    MCInstPrinter *InstPrinter =
+      getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI, STI);
+
+    // Create a code emitter if asked to show the encoding.
+    MCCodeEmitter *MCE = 0;
+    MCAsmBackend *MAB = 0;
+
+    MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
+                                                  true, /* verbose asm */
+                                                  hasMCUseLoc(),
+                                                  hasMCUseCFI(),
+                                                  InstPrinter,
+                                                  MCE, MAB,
+                                                  false /* show MC encoding */);
+    AsmStreamer.reset(S);
+    break;
+  }
+  case CGFT_ObjectFile: {
+    llvm_unreachable("Object file emission is not supported with PTX");
+  }
+  case CGFT_Null:
+    // The Null output is intended for use for performance analysis and testing,
+    // not real users.
+    AsmStreamer.reset(createNullStreamer(*Context));
+    break;
+  }
+
+  // MC Logging
+  //AsmStreamer.reset(createLoggingStreamer(AsmStreamer.take(), errs()));
+
+  // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  if (Printer == 0)
+    return true;
+
+  // If successful, createAsmPrinter took ownership of AsmStreamer.
+  AsmStreamer.take();
+
+  PM.add(Printer);
+
+  PM.add(createGCInfoDeleter());
+  return false;
+}
+
+bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
+                                              CodeGenOpt::Level OptLevel,
+                                              bool DisableVerify,
+                                              MCContext *&OutContext) {
+  // Add standard LLVM codegen passes.
+  // This is derived from LLVMTargetMachine::addCommonCodeGenPasses, with some
+  // modifications for the PTX target.
+
+  // Standard LLVM-Level Passes.
+
+  // Basic AliasAnalysis support.
+  // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+  // BasicAliasAnalysis wins if they disagree. This is intended to help
+  // support "obvious" type-punning idioms.
+  PM.add(createTypeBasedAliasAnalysisPass());
+  PM.add(createBasicAliasAnalysisPass());
+
+  // Before running any passes, run the verifier to determine if the input
+  // coming from the front-end and/or optimizer is valid.
+  if (!DisableVerify)
+    PM.add(createVerifierPass());
+
+  // Run loop strength reduction before anything else.
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createLoopStrengthReducePass(getTargetLowering()));
+    //PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
+  }
+
+  PM.add(createGCLoweringPass());
+
+  // Make sure that no unreachable blocks are instruction selected.
+  PM.add(createUnreachableBlockEliminationPass());
+
+  PM.add(createLowerInvokePass(getTargetLowering()));
+  // The lower invoke pass may create unreachable code. Remove it.
+  PM.add(createUnreachableBlockEliminationPass());
+
+  if (OptLevel != CodeGenOpt::None)
+    PM.add(createCodeGenPreparePass(getTargetLowering()));
+
+  PM.add(createStackProtectorPass(getTargetLowering()));
+
+  addPreISel(PM, OptLevel);
+
+  //PM.add(createPrintFunctionPass("\n\n"
+  //                               "*** Final LLVM Code input to ISel ***\n",
+  //                               &dbgs()));
+
+  // All passes which modify the LLVM IR are now complete; run the verifier
+  // to ensure that the IR is valid.
+  if (!DisableVerify)
+    PM.add(createVerifierPass());
+
+  // Standard Lower-Level Passes.
+
+  // Install a MachineModuleInfo class, which is an immutable pass that holds
+  // all the per-module stuff we're generating, including MCContext.
+  MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(),
+                                                 *getRegisterInfo(),
+                                    &getTargetLowering()->getObjFileLowering());
+  PM.add(MMI);
+  OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref.
+
+  // Set up a MachineFunction for the rest of CodeGen to work on.
+  PM.add(new MachineFunctionAnalysis(*this, OptLevel));
+
+  // Ask the target for an isel.
+  if (addInstSelector(PM, OptLevel))
+    return true;
+
+  // Print the instruction selected machine code...
+  printAndVerify(PM, "After Instruction Selection");
+
+  // Expand pseudo-instructions emitted by ISel.
+  PM.add(createExpandISelPseudosPass());
+
+  // Pre-ra tail duplication.
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createTailDuplicatePass(true));
+    printAndVerify(PM, "After Pre-RegAlloc TailDuplicate");
+  }
+
+  // Optimize PHIs before DCE: removing dead PHI cycles may make more
+  // instructions dead.
+  if (OptLevel != CodeGenOpt::None)
+    PM.add(createOptimizePHIsPass());
+
+  // If the target requests it, assign local variables to stack slots relative
+  // to one another and simplify frame index references where possible.
+  PM.add(createLocalStackSlotAllocationPass());
+
+  if (OptLevel != CodeGenOpt::None) {
+    // With optimization, dead code should already be eliminated. However
+    // there is one known exception: lowered code for arguments that are only
+    // used by tail calls, where the tail calls reuse the incoming stack
+    // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+    PM.add(createDeadMachineInstructionElimPass());
+    printAndVerify(PM, "After codegen DCE pass");
+
+    PM.add(createMachineLICMPass());
+    PM.add(createMachineCSEPass());
+    PM.add(createMachineSinkingPass());
+    printAndVerify(PM, "After Machine LICM, CSE and Sinking passes");
+
+    PM.add(createPeepholeOptimizerPass());
+    printAndVerify(PM, "After codegen peephole optimization pass");
+  }
+
+  // Run pre-ra passes.
+  if (addPreRegAlloc(PM, OptLevel))
+    printAndVerify(PM, "After PreRegAlloc passes");
+
+  // Perform register allocation.
+  PM.add(createPTXRegisterAllocator());
+  printAndVerify(PM, "After Register Allocation");
+
+  // Perform stack slot coloring and post-ra machine LICM.
+  if (OptLevel != CodeGenOpt::None) {
+    // FIXME: Re-enable coloring with register when it's capable of adding
+    // kill markers.
+    PM.add(createStackSlotColoringPass(false));
+
+    // FIXME: Post-RA LICM has asserts that fire on virtual registers.
+    // Run post-ra machine LICM to hoist reloads / remats.
+    //if (!DisablePostRAMachineLICM)
+    //  PM.add(createMachineLICMPass(false));
+
+    printAndVerify(PM, "After StackSlotColoring and postra Machine LICM");
+  }
+
+  // Run post-ra passes.
+  if (addPostRegAlloc(PM, OptLevel))
+    printAndVerify(PM, "After PostRegAlloc passes");
+
+  PM.add(createExpandPostRAPseudosPass());
+  printAndVerify(PM, "After ExpandPostRAPseudos");
+
+  // Insert prolog/epilog code.  Eliminate abstract frame index references...
+  PM.add(createPrologEpilogCodeInserter());
+  printAndVerify(PM, "After PrologEpilogCodeInserter");
+
+  // Run pre-sched2 passes.
+  if (addPreSched2(PM, OptLevel))
+    printAndVerify(PM, "After PreSched2 passes");
+
+  // Second pass scheduler.
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createPostRAScheduler(OptLevel));
+    printAndVerify(PM, "After PostRAScheduler");
+  }
+
+  // Branch folding must be run after regalloc and prolog/epilog insertion.
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createBranchFoldingPass(getEnableTailMergeDefault()));
+    printNoVerify(PM, "After BranchFolding");
+  }
+
+  // Tail duplication.
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createTailDuplicatePass(false));
+    printNoVerify(PM, "After TailDuplicate");
+  }
+
+  PM.add(createGCMachineCodeAnalysisPass());
+
+  //if (PrintGCInfo)
+  //  PM.add(createGCInfoPrinter(dbgs()));
+
+  if (OptLevel != CodeGenOpt::None) {
+    PM.add(createCodePlacementOptPass());
+    printNoVerify(PM, "After CodePlacementOpt");
+  }
+
+  if (addPreEmitPass(PM, OptLevel))
+    printNoVerify(PM, "After PreEmit passes");
+
   PM.add(createPTXMFInfoExtract(*this, OptLevel));
+  PM.add(createPTXFPRoundingModePass(*this, OptLevel));
+
   return false;
 }
diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h
index ae42153252111..5b7c82b1f4f42 100644
--- a/lib/Target/PTX/PTXTargetMachine.h
+++ b/lib/Target/PTX/PTXTargetMachine.h
@@ -17,6 +17,7 @@
 #include "PTXISelLowering.h"
 #include "PTXInstrInfo.h"
 #include "PTXFrameLowering.h"
+#include "PTXSelectionDAGInfo.h"
 #include "PTXSubtarget.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -25,15 +26,17 @@
 namespace llvm {
 class PTXTargetMachine : public LLVMTargetMachine {
   private:
-    const TargetData  DataLayout;
-    PTXSubtarget      Subtarget; // has to be initialized before FrameLowering
-    PTXFrameLowering  FrameLowering;
-    PTXInstrInfo      InstrInfo;
-    PTXTargetLowering TLInfo;
+    const TargetData    DataLayout;
+    PTXSubtarget        Subtarget; // has to be initialized before FrameLowering
+    PTXFrameLowering    FrameLowering;
+    PTXInstrInfo        InstrInfo;
+    PTXSelectionDAGInfo TSInfo;
+    PTXTargetLowering   TLInfo;
 
   public:
-    PTXTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS,
+    PTXTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM,
                      bool is64Bit);
 
     virtual const TargetData *getTargetData() const { return &DataLayout; }
@@ -49,27 +52,62 @@ class PTXTargetMachine : public LLVMTargetMachine {
     virtual const PTXTargetLowering *getTargetLowering() const {
       return &TLInfo; }
 
+    virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const {
+      return &TSInfo;
+    }
+
     virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
 
     virtual bool addInstSelector(PassManagerBase &PM,
                                  CodeGenOpt::Level OptLevel);
     virtual bool addPostRegAlloc(PassManagerBase &PM,
                                  CodeGenOpt::Level OptLevel);
+
+    // We override this method to supply our own set of codegen passes.
+    virtual bool addPassesToEmitFile(PassManagerBase &,
+                                     formatted_raw_ostream &,
+                                     CodeGenFileType,
+                                     CodeGenOpt::Level,
+                                     bool = true);
+
+    // Emission of machine code through JITCodeEmitter is not supported.
+    virtual bool addPassesToEmitMachineCode(PassManagerBase &,
+                                            JITCodeEmitter &,
+                                            CodeGenOpt::Level,
+                                            bool = true) {
+      return true;
+    }
+
+    // Emission of machine code through MCJIT is not supported.
+    virtual bool addPassesToEmitMC(PassManagerBase &,
+                                   MCContext *&,
+                                   raw_ostream &,
+                                   CodeGenOpt::Level,
+                                   bool = true) {
+      return true;
+    }
+
+  private:
+
+    bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
+                                bool DisableVerify, MCContext *&OutCtx);
 }; // class PTXTargetMachine
 
 
 class PTX32TargetMachine : public PTXTargetMachine {
 public:
 
-  PTX32TargetMachine(const Target &T, const std::string &TT,
-                     const std::string& CPU, const std::string& FS);
+  PTX32TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 }; // class PTX32TargetMachine
 
 class PTX64TargetMachine : public PTXTargetMachine {
 public:
 
-  PTX64TargetMachine(const Target &T, const std::string &TT,
-                     const std::string& CPU, const std::string& FS);
+  PTX64TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 }; // class PTX32TargetMachine
 
 } // namespace llvm
diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt
index 4b09cf5ce0995..2366e45294f83 100644
--- a/lib/Target/PTX/TargetInfo/CMakeLists.txt
+++ b/lib/Target/PTX/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMPTXInfo
   PTXTargetInfo.cpp
   )
 
-add_dependencies(LLVMPTXInfo PTXCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMPTXInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMPTXInfo PTXCommonTableGen)
diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
index 9df6c7567bd10..09a27358da9fc 100644
--- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
+++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "PTX.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/PTX/generate-register-td.py b/lib/Target/PTX/generate-register-td.py
deleted file mode 100755
index 15286908961d7..0000000000000
--- a/lib/Target/PTX/generate-register-td.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-##===- generate-register-td.py --------------------------------*-python-*--===##
-##
-##                     The LLVM Compiler Infrastructure
-##
-## This file is distributed under the University of Illinois Open Source
-## License. See LICENSE.TXT for details.
-##
-##===----------------------------------------------------------------------===##
-##
-## This file describes the PTX register file generator.
-##
-##===----------------------------------------------------------------------===##
-
-from sys import argv, exit, stdout
-
-
-if len(argv) != 5:
-    print('Usage: generate-register-td.py <num_preds> <num_16> <num_32> <num_64>')
-    exit(1)
-
-try:
-    num_pred  = int(argv[1])
-    num_16bit = int(argv[2])
-    num_32bit = int(argv[3])
-    num_64bit = int(argv[4])
-except:
-    print('ERROR: Invalid integer parameter')
-    exit(1)
-
-## Print the register definition file
-td_file = open('PTXRegisterInfo.td', 'w')
-
-td_file.write('''
-//===- PTXRegisterInfo.td - PTX Register defs ----------------*- tblgen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//  Declarations that describe the PTX register file
-//===----------------------------------------------------------------------===//
-
-class PTXReg<string n> : Register<n> {
-  let Namespace = "PTX";
-}
-
-//===----------------------------------------------------------------------===//
-//  Registers
-//===----------------------------------------------------------------------===//
-''')
-
-
-# Print predicate registers
-td_file.write('\n///===- Predicate Registers -----------------------------------------------===//\n\n')
-for r in range(0, num_pred):
-    td_file.write('def P%d : PTXReg<"p%d">;\n' % (r, r))
-
-# Print 16-bit registers
-td_file.write('\n///===- 16-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_16bit):
-    td_file.write('def RH%d : PTXReg<"rh%d">;\n' % (r, r))
-
-# Print 32-bit registers
-td_file.write('\n///===- 32-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_32bit):
-    td_file.write('def R%d : PTXReg<"r%d">;\n' % (r, r))
-
-# Print 64-bit registers
-td_file.write('\n///===- 64-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_64bit):
-    td_file.write('def RD%d : PTXReg<"rd%d">;\n' % (r, r))
-
-
-td_file.write('''
-//===----------------------------------------------------------------------===//
-//  Register classes
-//===----------------------------------------------------------------------===//
-''')
-
-
-# Print register classes
-
-td_file.write('def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%%u", 0, %d)>;\n' % (num_pred-1))
-td_file.write('def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%%u", 0, %d)>;\n' % (num_16bit-1))
-td_file.write('def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%%u", 0, %d)>;\n' % (num_32bit-1))
-td_file.write('def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%%u", 0, %d)>;\n' % (num_64bit-1))
-td_file.write('def RegF32 : RegisterClass<"PTX", [f32], 32, (sequence "R%%u", 0, %d)>;\n' % (num_32bit-1))
-td_file.write('def RegF64 : RegisterClass<"PTX", [f64], 64, (sequence "RD%%u", 0, %d)>;\n' % (num_64bit-1))
-
-
-td_file.close()
-
-## Now write the PTXCallingConv.td file
-td_file = open('PTXCallingConv.td', 'w')
-
-# Reserve 10% of the available registers for return values, and the other 90%
-# for parameters
-num_ret_pred    = int(0.1 * num_pred)
-num_ret_16bit   = int(0.1 * num_16bit)
-num_ret_32bit   = int(0.1 * num_32bit)
-num_ret_64bit   = int(0.1 * num_64bit)
-num_param_pred  = num_pred - num_ret_pred
-num_param_16bit = num_16bit - num_ret_16bit
-num_param_32bit = num_32bit - num_ret_32bit
-num_param_64bit = num_64bit - num_ret_64bit
-
-param_regs_pred  = [('P%d' % (i+num_ret_pred)) for i in range(0, num_param_pred)]
-ret_regs_pred    = ['P%d' % i for i in range(0, num_ret_pred)]
-param_regs_16bit = [('RH%d' % (i+num_ret_16bit)) for i in range(0, num_param_16bit)]
-ret_regs_16bit   = ['RH%d' % i for i in range(0, num_ret_16bit)]
-param_regs_32bit = [('R%d' % (i+num_ret_32bit)) for i in range(0, num_param_32bit)]
-ret_regs_32bit   = ['R%d' % i for i in range(0, num_ret_32bit)]
-param_regs_64bit = [('RD%d' % (i+num_ret_64bit)) for i in range(0, num_param_64bit)]
-ret_regs_64bit   = ['RD%d' % i for i in range(0, num_ret_64bit)]
-
-param_list_pred  = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_pred)
-ret_list_pred    = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_pred)
-param_list_16bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_16bit)
-ret_list_16bit   = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_16bit)
-param_list_32bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_32bit)
-ret_list_32bit   = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_32bit)
-param_list_64bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_64bit)
-ret_list_64bit   = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_64bit)
-
-td_file.write('''
-//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the PTX architecture.
-//
-//===----------------------------------------------------------------------===//
-
-// PTX Formal Parameter Calling Convention
-def CC_PTX : CallingConv<[
-  CCIfType<[i1],      CCAssignToReg<[%s]>>,
-  CCIfType<[i16],     CCAssignToReg<[%s]>>,
-  CCIfType<[i32,f32], CCAssignToReg<[%s]>>,
-  CCIfType<[i64,f64], CCAssignToReg<[%s]>>
-]>;
-
-// PTX Return Value Calling Convention
-def RetCC_PTX : CallingConv<[
-  CCIfType<[i1],      CCAssignToReg<[%s]>>,
-  CCIfType<[i16],     CCAssignToReg<[%s]>>,
-  CCIfType<[i32,f32], CCAssignToReg<[%s]>>,
-  CCIfType<[i64,f64], CCAssignToReg<[%s]>>
-]>;
-''' % (param_list_pred, param_list_16bit, param_list_32bit, param_list_64bit,
-       ret_list_pred, ret_list_16bit, ret_list_32bit, ret_list_64bit))
-
-
-td_file.close()
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index d1dda3716c4a1..73b4aba9f0154 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -1,16 +1,16 @@
 set(LLVM_TARGET_DEFINITIONS PPC.td)
 
-tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
-tablegen(PPCGenCodeEmitter.inc -gen-emitter)
-tablegen(PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
-tablegen(PPCGenRegisterInfo.inc -gen-register-info)
-tablegen(PPCGenInstrInfo.inc -gen-instr-info)
-tablegen(PPCGenDAGISel.inc -gen-dag-isel)
-tablegen(PPCGenCallingConv.inc -gen-callingconv)
-tablegen(PPCGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(PPCGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+llvm_tablegen(PPCGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(PPCGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(PPCGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(PPCGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(PPCGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(PowerPCCommonTableGen)
 
 add_llvm_target(PowerPCCodeGen
-  PPCAsmBackend.cpp
   PPCAsmPrinter.cpp
   PPCBranchSelector.cpp
   PPCCodeEmitter.cpp
@@ -20,15 +20,27 @@ add_llvm_target(PowerPCCodeGen
   PPCISelLowering.cpp
   PPCFrameLowering.cpp
   PPCJITInfo.cpp
-  PPCMCCodeEmitter.cpp
   PPCMCInstLower.cpp
-  PPCPredicates.cpp
   PPCRegisterInfo.cpp
   PPCSubtarget.cpp
   PPCTargetMachine.cpp
   PPCSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMPowerPCCodeGen
+  LLVMAnalysis
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMPowerPCAsmPrinter
+  LLVMPowerPCDesc
+  LLVMPowerPCInfo
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
index 389ea7742b065..1d857e2f48d44 100644
--- a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
+++ b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMPowerPCAsmPrinter
   PPCInstPrinter.cpp
   )
-add_dependencies(LLVMPowerPCAsmPrinter PowerPCCodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMPowerPCAsmPrinter
+  LLVMMC
+  LLVMSupport
+  )
+
+add_dependencies(LLVMPowerPCAsmPrinter PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 1a9bd761359c8..b6a08354a2100 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -13,7 +13,8 @@
 
 #define DEBUG_TYPE "asm-printer"
 #include "PPCInstPrinter.h"
-#include "PPCPredicates.h"
+#include "MCTargetDesc/PPCBaseInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/raw_ostream.h"
@@ -30,7 +31,8 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   OS << getRegisterName(RegNo);
 }
 
-void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
   // Check for slwi/srwi mnemonics.
   if (MI->getOpcode() == PPC::RLWINM) {
     unsigned char SH = MI->getOperand(2).getImm();
@@ -49,6 +51,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
       O << ", ";
       printOperand(MI, 1, O);
       O << ", " << (unsigned int)SH;
+
+      printAnnotation(O, Annot);
       return;
     }
   }
@@ -59,6 +63,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
     printOperand(MI, 0, O);
     O << ", ";
     printOperand(MI, 1, O);
+    printAnnotation(O, Annot);
     return;
   }
   
@@ -72,11 +77,13 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
       O << ", ";
       printOperand(MI, 1, O);
       O << ", " << (unsigned int)SH;
+      printAnnotation(O, Annot);
       return;
     }
   }
   
   printInstruction(MI, O);
+  printAnnotation(O, Annot);
 }
 
 
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index d022a4496e840..4ed4b765c1c70 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -32,7 +32,7 @@ public:
   }
   
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O);
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
   static const char *getInstructionName(unsigned Opcode);
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index a1b81662115a1..c4041db8cf0bc 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,16 @@
 add_llvm_library(LLVMPowerPCDesc
+  PPCAsmBackend.cpp
   PPCMCTargetDesc.cpp
   PPCMCAsmInfo.cpp
+  PPCMCCodeEmitter.cpp
+  PPCPredicates.cpp
   )
+
+add_llvm_library_dependencies(LLVMPowerPCDesc
+  LLVMMC
+  LLVMPowerPCAsmPrinter
+  LLVMPowerPCInfo
+  LLVMSupport
+  )
+
+add_dependencies(LLVMPowerPCDesc PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
new file mode 100644
index 0000000000000..9f2fd6d01b8e4
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -0,0 +1,191 @@
+//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case PPC::fixup_ppc_brcond14:
+    return Value & 0x3ffc;
+  case PPC::fixup_ppc_br24:
+    return Value & 0x3fffffc;
+#if 0
+  case PPC::fixup_ppc_hi16:
+    return (Value >> 16) & 0xffff;
+#endif
+  case PPC::fixup_ppc_ha16:
+    return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
+  case PPC::fixup_ppc_lo16:
+    return Value & 0xffff;
+  }
+}
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) {}
+};
+
+class PPCELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  PPCELFObjectWriter(bool Is64Bit, Triple::OSType OSType, uint16_t EMachine,
+                     bool HasRelocationAddend, bool isLittleEndian)
+    : MCELFObjectTargetWriter(Is64Bit, OSType, EMachine, HasRelocationAddend) {}
+};
+
+class PPCAsmBackend : public MCAsmBackend {
+const Target &TheTarget;
+public:
+  PPCAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
+
+  unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_lo16",        16,     16,   0 },
+      { "fixup_ppc_ha16",        16,     16,   0 },
+      { "fixup_ppc_lo14",        16,     14,   0 }
+    };
+  
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+  
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+  
+  bool MayNeedRelaxation(const MCInst &Inst) const {
+    // FIXME.
+    return false;
+  }
+  
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+    // FIXME.
+    assert(0 && "RelaxInstruction() unimplemented");
+  }
+  
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+    // FIXME: Zero fill for now. That's not right, but at least will get the
+    // section size right.
+    for (uint64_t i = 0; i != Count; ++i)
+      OW->Write8(0);
+    return true;
+  }      
+  
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T) { }
+    
+    void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                    uint64_t Value) const {
+      assert(0 && "UNIMP");
+    }
+    
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+      bool is64 = getPointerSize() == 8;
+      return createMachObjectWriter(new PPCMachObjectWriter(
+                                      /*Is64Bit=*/is64,
+                                      (is64 ? object::mach::CTM_PowerPC64 :
+                                       object::mach::CTM_PowerPC),
+                                      object::mach::CSPPC_ALL),
+                                    OS, /*IsLittleEndian=*/false);
+    }
+    
+    virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+      return false;
+    }
+  };
+
+  class ELFPPCAsmBackend : public PPCAsmBackend {
+    Triple::OSType OSType;
+  public:
+    ELFPPCAsmBackend(const Target &T, Triple::OSType OSType) :
+      PPCAsmBackend(T), OSType(OSType) { }
+    
+    void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                    uint64_t Value) const {
+      Value = adjustFixupValue(Fixup.getKind(), Value);
+      if (!Value) return;           // Doesn't change encoding.
+
+      unsigned Offset = Fixup.getOffset();
+
+      // For each byte of the fragment that the fixup touches, mask in the bits from
+      // the fixup value. The Value has been "split up" into the appropriate
+      // bitfields above.
+      for (unsigned i = 0; i != 4; ++i)
+        Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+    }
+    
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+      bool is64 = getPointerSize() == 8;
+      return createELFObjectWriter(new PPCELFObjectWriter(
+                                      /*Is64Bit=*/is64,
+                                      OSType,
+                                      is64 ? ELF::EM_PPC64 : ELF::EM_PPC,                                      
+                                      /*addend*/ true, /*isLittleEndian*/ false),
+                                   OS, /*IsLittleEndian=*/false);
+    }
+    
+    virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+      return false;
+    }
+  };
+
+} // end anonymous namespace
+
+
+
+
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT) {
+  if (Triple(TT).isOSDarwin())
+    return new DarwinPPCAsmBackend(T);
+
+  return new ELFPPCAsmBackend(T, Triple(TT).getOS());
+}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h
new file mode 100644
index 0000000000000..369bbdce11f59
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h
@@ -0,0 +1,70 @@
+//===-- PPCBaseInfo.h - Top level definitions for PPC -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the PPC target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCBASEINFO_H
+#define PPCBASEINFO_H
+
+#include "PPCMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// getPPCRegisterNumbering - Given the enum value for some register, e.g.
+/// PPC::F14, return the number that it corresponds to (e.g. 14).
+inline static unsigned getPPCRegisterNumbering(unsigned RegEnum) {
+  using namespace PPC;
+  switch (RegEnum) {
+  case 0: return 0;
+  case R0 :  case X0 :  case F0 :  case V0 : case CR0:  case CR0LT: return  0;
+  case R1 :  case X1 :  case F1 :  case V1 : case CR1:  case CR0GT: return  1;
+  case R2 :  case X2 :  case F2 :  case V2 : case CR2:  case CR0EQ: return  2;
+  case R3 :  case X3 :  case F3 :  case V3 : case CR3:  case CR0UN: return  3;
+  case R4 :  case X4 :  case F4 :  case V4 : case CR4:  case CR1LT: return  4;
+  case R5 :  case X5 :  case F5 :  case V5 : case CR5:  case CR1GT: return  5;
+  case R6 :  case X6 :  case F6 :  case V6 : case CR6:  case CR1EQ: return  6;
+  case R7 :  case X7 :  case F7 :  case V7 : case CR7:  case CR1UN: return  7;
+  case R8 :  case X8 :  case F8 :  case V8 : case CR2LT: return  8;
+  case R9 :  case X9 :  case F9 :  case V9 : case CR2GT: return  9;
+  case R10:  case X10:  case F10:  case V10: case CR2EQ: return 10;
+  case R11:  case X11:  case F11:  case V11: case CR2UN: return 11;
+  case R12:  case X12:  case F12:  case V12: case CR3LT: return 12;
+  case R13:  case X13:  case F13:  case V13: case CR3GT: return 13;
+  case R14:  case X14:  case F14:  case V14: case CR3EQ: return 14;
+  case R15:  case X15:  case F15:  case V15: case CR3UN: return 15;
+  case R16:  case X16:  case F16:  case V16: case CR4LT: return 16;
+  case R17:  case X17:  case F17:  case V17: case CR4GT: return 17;
+  case R18:  case X18:  case F18:  case V18: case CR4EQ: return 18;
+  case R19:  case X19:  case F19:  case V19: case CR4UN: return 19;
+  case R20:  case X20:  case F20:  case V20: case CR5LT: return 20;
+  case R21:  case X21:  case F21:  case V21: case CR5GT: return 21;
+  case R22:  case X22:  case F22:  case V22: case CR5EQ: return 22;
+  case R23:  case X23:  case F23:  case V23: case CR5UN: return 23;
+  case R24:  case X24:  case F24:  case V24: case CR6LT: return 24;
+  case R25:  case X25:  case F25:  case V25: case CR6GT: return 25;
+  case R26:  case X26:  case F26:  case V26: case CR6EQ: return 26;
+  case R27:  case X27:  case F27:  case V27: case CR6UN: return 27;
+  case R28:  case X28:  case F28:  case V28: case CR7LT: return 28;
+  case R29:  case X29:  case F29:  case V29: case CR7GT: return 29;
+  case R30:  case X30:  case F30:  case V30: case CR7EQ: return 30;
+  case R31:  case X31:  case F31:  case V31: case CR7UN: return 31;
+  default:
+    llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
new file mode 100644
index 0000000000000..b3c889e3f8da4
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -0,0 +1,45 @@
+//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PPC_PPCFIXUPKINDS_H
+#define LLVM_PPC_PPCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace PPC {
+enum Fixups {
+  // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
+  // and 'bl'.
+  fixup_ppc_br24 = FirstTargetFixupKind,
+  
+  /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14,
+  
+  /// fixup_ppc_lo16 - A 16-bit fixup corresponding to lo16(_foo) for instrs
+  /// like 'li'.
+  fixup_ppc_lo16,
+  
+  /// fixup_ppc_ha16 - A 16-bit fixup corresponding to ha16(_foo) for instrs
+  /// like 'lis'.
+  fixup_ppc_ha16,
+  
+  /// fixup_ppc_lo14 - A 14-bit fixup corresponding to lo16(_foo) for instrs
+  /// like 'std'.
+  fixup_ppc_lo14,
+  
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b6dca835b18d6..e9424d8415f64 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -31,6 +31,10 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
 }
 
 PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+  if (is64Bit)
+    PointerSize = 8;
+  IsLittleEndian = false;
+
   // ".comm align is in bytes but .align is pow-2."
   AlignmentIsInBytes = false;
 
@@ -56,7 +60,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
     
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
-  HasLCOMMDirective = true;
+  LCOMMDirectiveType = LCOMM::NoAlignment;
   AssemblerDialect = 0;           // Old-Style mnemonics.
 }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
new file mode 100644
index 0000000000000..262f97c36a936
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -0,0 +1,193 @@
+//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/PPCBaseInfo.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class PPCMCCodeEmitter : public MCCodeEmitter {
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const PPCMCCodeEmitter &);   // DO NOT IMPLEMENT
+  
+public:
+  PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx) {
+  }
+  
+  ~PPCMCCodeEmitter() {}
+
+  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getHA16Encoding(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getLO16Encoding(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  unsigned getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned Bits = getBinaryCodeForInstr(MI, Fixups);
+    
+    // Output the constant in big endian byte order.
+    for (unsigned i = 0; i != 4; ++i) {
+      OS << (char)(Bits >> 24);
+      Bits <<= 8;
+    }
+    
+    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+  }
+  
+};
+  
+} // end anonymous namespace
+  
+MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new PPCMCCodeEmitter(MCII, STI, Ctx);
+}
+
+unsigned PPCMCCodeEmitter::
+getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getHA16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_ha16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getLO16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 16;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 14;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo14));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::
+get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> getPPCRegisterNumbering(MO.getReg());
+}
+
+
+unsigned PPCMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    return getPPCRegisterNumbering(MO.getReg());
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+
+#include "PPCGenMCCodeEmitter.inc"
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 02b887f4d5dc6..d5c8a9e72c67f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -13,10 +13,14 @@
 
 #include "PPCMCTargetDesc.h"
 #include "PPCMCAsmInfo.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "PPCGenInstrInfo.inc"
@@ -35,11 +39,16 @@ static MCInstrInfo *createPPCMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializePowerPCMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
-}
+static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64);
+  unsigned Flavour = isPPC64 ? 0 : 1;
+  unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
 
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+  return X;
+}
 
 static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
                                                  StringRef FS) {
@@ -48,23 +57,95 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializePowerPCMCSubtargetInfo() {
+static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new PPCMCAsmInfoDarwin(isPPC64);
+  else
+    MAI = new PPCLinuxMCAsmInfo(isPPC64);
+
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(PPC::R1, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  if (RM == Reloc::Default) {
+    Triple T(TT);
+    if (T.isOSDarwin())
+      RM = Reloc::DynamicNoPIC;
+    else
+      RM = Reloc::Static;
+  }
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  if (Triple(TT).isOSDarwin())
+    return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCSubtargetInfo &STI) {
+  return new PPCInstPrinter(MAI, SyntaxVariant);
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo);
+  RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo);  
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo);
+
+  // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target,
                                           createPPCMCSubtargetInfo);
   TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target,
                                           createPPCMCSubtargetInfo);
-}
 
-static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
-  Triple TheTriple(TT);
-  bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
-  if (TheTriple.isOSDarwin())
-    return new PPCMCAsmInfoDarwin(isPPC64);
-  return new PPCLinuxMCAsmInfo(isPPC64);
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
   
-}
+    // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend);
+  
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
 
-extern "C" void LLVMInitializePowerPCMCAsmInfo() {
-  RegisterMCAsmInfoFn C(ThePPC32Target, createMCAsmInfo);
-  RegisterMCAsmInfoFn D(ThePPC64Target, createMCAsmInfo);  
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index cee235097a0ab..e5bf2a9dd92fb 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -15,6 +15,10 @@
 #define PPCMCTARGETDESC_H
 
 namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
 class MCSubtargetInfo;
 class Target;
 class StringRef;
@@ -22,6 +26,12 @@ class StringRef;
 extern Target ThePPC32Target;
 extern Target ThePPC64Target;
   
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT);
+  
 } // End llvm namespace
 
 // Defines symbolic names for PowerPC registers.  This defines a mapping from
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
new file mode 100644
index 0000000000000..12bb0a143406b
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -0,0 +1,31 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown PPC branch opcode!");
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  }
+}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
new file mode 100644
index 0000000000000..f872e861bfa71
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -0,0 +1,37 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_ALWAYS = (0 << 5) | 20,
+    PRED_LT     = (0 << 5) | 12,
+    PRED_LE     = (1 << 5) |  4,
+    PRED_EQ     = (2 << 5) | 12,
+    PRED_GE     = (0 << 5) |  4,
+    PRED_GT     = (1 << 5) | 12,
+    PRED_NE     = (2 << 5) |  4,
+    PRED_UN     = (3 << 5) | 12,
+    PRED_NU     = (3 << 5) |  4
+  };
+  
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 7191dd105f3c2..5dc1863a0b2d6 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TARGET_POWERPC_H
 #define LLVM_TARGET_POWERPC_H
 
+#include "MCTargetDesc/PPCBaseInfo.h"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include <string>
 
@@ -30,22 +31,12 @@ namespace llvm {
   class MachineInstr;
   class AsmPrinter;
   class MCInst;
-  class MCCodeEmitter;
-  class MCContext;
-  class MCInstrInfo;
-  class MCSubtargetInfo;
   class TargetMachine;
-  class TargetAsmBackend;
   
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
                                             JITCodeEmitter &MCE);
-  MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
-                                        const MCSubtargetInfo &STI,
-                                        MCContext &Ctx);
-  TargetAsmBackend *createPPCAsmBackend(const Target &, const std::string &);
-  
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
   
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index aabf494012e25..2d5d302728f73 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -43,9 +43,9 @@ def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
 def FeatureGPUL      : SubtargetFeature<"gpul","IsGigaProcessor", "true",
                                         "Enable GPUL instructions">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
-                                        "Enable the fsqrt instruction">; 
+                                        "Enable the fsqrt instruction">;
 def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
-                                        "Enable the stfiwx instruction">; 
+                                        "Enable the stfiwx instruction">;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/PPCAsmBackend.cpp
deleted file mode 100644
index 4b8cbb7118330..0000000000000
--- a/lib/Target/PowerPC/PPCAsmBackend.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmBackend.h"
-#include "PPC.h"
-#include "PPCFixupKinds.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
-#include "llvm/Target/TargetRegistry.h"
-using namespace llvm;
-
-namespace {
-class PPCMachObjectWriter : public MCMachObjectTargetWriter {
-public:
-  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
-
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) {}
-};
-
-class PPCAsmBackend : public TargetAsmBackend {
-const Target &TheTarget;
-public:
-  PPCAsmBackend(const Target &T) : TargetAsmBackend(), TheTarget(T) {}
-
-  unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
-    const static MCFixupKindInfo Infos[PPC::NumTargetFixupKinds] = {
-      // name                    offset  bits  flags
-      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_ppc_lo16",        16,     16,   0 },
-      { "fixup_ppc_ha16",        16,     16,   0 },
-      { "fixup_ppc_lo14",        16,     14,   0 }
-    };
-  
-    if (Kind < FirstTargetFixupKind)
-      return TargetAsmBackend::getFixupKindInfo(Kind);
-  
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return Infos[Kind - FirstTargetFixupKind];
-  }
-  
-  bool MayNeedRelaxation(const MCInst &Inst) const {
-    // FIXME.
-    return false;
-  }
-  
-  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
-    // FIXME.
-    assert(0 && "RelaxInstruction() unimplemented");
-  }
-  
-  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
-    // FIXME: Zero fill for now. That's not right, but at least will get the
-    // section size right.
-    for (uint64_t i = 0; i != Count; ++i)
-      OW->Write8(0);
-    return true;
-  }      
-  
-  unsigned getPointerSize() const {
-    StringRef Name = TheTarget.getName();
-    if (Name == "ppc64") return 8;
-    assert(Name == "ppc32" && "Unknown target name!");
-    return 4;
-  }
-};
-} // end anonymous namespace
-
-
-// FIXME: This should be in a separate file.
-namespace {
-  class DarwinPPCAsmBackend : public PPCAsmBackend {
-  public:
-    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T) { }
-    
-    void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                    uint64_t Value) const {
-      assert(0 && "UNIMP");
-    }
-    
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-      bool is64 = getPointerSize() == 8;
-      return createMachObjectWriter(new PPCMachObjectWriter(
-                                      /*Is64Bit=*/is64,
-                                      (is64 ? object::mach::CTM_PowerPC64 :
-                                       object::mach::CTM_PowerPC),
-                                      object::mach::CSPPC_ALL),
-                                    OS, /*IsLittleEndian=*/false);
-    }
-    
-    virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-      return false;
-    }
-  };
-} // end anonymous namespace
-
-
-
-
-TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T,
-                                            const std::string &TT) {
-  if (Triple(TT).isOSDarwin())
-    return new DarwinPPCAsmBackend(T);
-
-  return 0;
-}
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 9de2200296e80..952845943179c 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -18,9 +18,9 @@
 
 #define DEBUG_TYPE "asmprinter"
 #include "PPC.h"
-#include "PPCPredicates.h"
 #include "PPCTargetMachine.h"
 #include "PPCSubtarget.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -43,11 +43,11 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSet.h"
@@ -679,18 +679,8 @@ static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
   return new PPCLinuxAsmPrinter(tm, Streamer);
 }
 
-static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
-                                             unsigned SyntaxVariant,
-                                             const MCAsmInfo &MAI) {
-  return new PPCInstPrinter(MAI, SyntaxVariant);
-}
-
-
 // Force static initialization.
 extern "C" void LLVMInitializePowerPCAsmPrinter() { 
   TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
   TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
-  
-  TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
 }
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index e161d23600e27..475edf309c0ce 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -19,7 +19,7 @@
 #include "PPC.h"
 #include "PPCInstrBuilder.h"
 #include "PPCInstrInfo.h"
-#include "PPCPredicates.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/Statistic.h"
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 42232a07535b0..4a1f1822afdf1 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -140,7 +140,7 @@ unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
   const MachineOperand &MO = MI.getOperand(OpNo);
   assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
          (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
-  return 0x80 >> PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+  return 0x80 >> getPPCRegisterNumbering(MO.getReg());
 }
 
 MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO, 
@@ -250,7 +250,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
     // The GPR operand should come through here though.
     assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
            MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
-    return PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+    return getPPCRegisterNumbering(MO.getReg());
   }
   
   assert(MO.isImm() &&
diff --git a/lib/Target/PowerPC/PPCFixupKinds.h b/lib/Target/PowerPC/PPCFixupKinds.h
deleted file mode 100644
index b3c889e3f8da4..0000000000000
--- a/lib/Target/PowerPC/PPCFixupKinds.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_PPC_PPCFIXUPKINDS_H
-#define LLVM_PPC_PPCFIXUPKINDS_H
-
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-namespace PPC {
-enum Fixups {
-  // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
-  // and 'bl'.
-  fixup_ppc_br24 = FirstTargetFixupKind,
-  
-  /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
-  /// branches.
-  fixup_ppc_brcond14,
-  
-  /// fixup_ppc_lo16 - A 16-bit fixup corresponding to lo16(_foo) for instrs
-  /// like 'li'.
-  fixup_ppc_lo16,
-  
-  /// fixup_ppc_ha16 - A 16-bit fixup corresponding to ha16(_foo) for instrs
-  /// like 'lis'.
-  fixup_ppc_ha16,
-  
-  /// fixup_ppc_lo14 - A 14-bit fixup corresponding to lo16(_foo) for instrs
-  /// like 'std'.
-  fixup_ppc_lo14,
-  
-  // Marker
-  LastTargetFixupKind,
-  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
-};
-}
-}
-
-#endif
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 375e000fe401b..7dead10c7c2a1 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -109,14 +109,14 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
   for (MachineRegisterInfo::livein_iterator
        I = MF->getRegInfo().livein_begin(),
        E = MF->getRegInfo().livein_end(); I != E; ++I) {
-    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first);
+    unsigned RegNo = getPPCRegisterNumbering(I->first);
     if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
       UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
   }
   for (MachineRegisterInfo::liveout_iterator
        I = MF->getRegInfo().liveout_begin(),
        E = MF->getRegInfo().liveout_end(); I != E; ++I) {
-    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I);
+    unsigned RegNo = getPPCRegisterNumbering(*I);
     if (VRRegNo[RegNo] == *I)              // If this really is a vector reg.
       UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
   }
@@ -712,13 +712,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-void PPCFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
-  // Initial state of the frame pointer is R1.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(PPC::R1, 0);
-  Moves.push_back(MachineMove(0, Dst, Src));
-}
-
 static bool spillsCR(const MachineFunction &MF) {
   const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
   return FuncInfo->isCRSpilled();
@@ -885,7 +878,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
       FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
     }
 
-    LowerBound -= (31 - PPCRegisterInfo::getRegisterNumbering(MinFPR) + 1) * 8;
+    LowerBound -= (31 - getPPCRegisterNumbering(MinFPR) + 1) * 8;
   }
 
   // Check whether the frame pointer register is allocated. If so, make sure it
@@ -919,8 +912,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
     }
 
     unsigned MinReg =
-      std::min<unsigned>(PPCRegisterInfo::getRegisterNumbering(MinGPR),
-                         PPCRegisterInfo::getRegisterNumbering(MinG8R));
+      std::min<unsigned>(getPPCRegisterNumbering(MinGPR),
+                         getPPCRegisterNumbering(MinG8R));
 
     if (Subtarget.isPPC64()) {
       LowerBound -= (31 - MinReg + 1) * 8;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 0c18de1e2e26e..20faa71d4148d 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -40,7 +40,6 @@ public:
 
   bool hasFP(const MachineFunction &MF) const;
   bool needsFP(const MachineFunction &MF) const;
-  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS = NULL) const;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2176c02c85032..6f204cc586369 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -14,8 +14,8 @@
 
 #define DEBUG_TYPE "ppc-codegen"
 #include "PPC.h"
-#include "PPCPredicates.h"
 #include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 9741a3902af7e..d6b8a9ee93c7a 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14,8 +14,8 @@
 #include "PPCISelLowering.h"
 #include "PPCMachineFunctionInfo.h"
 #include "PPCPerfectShuffle.h"
-#include "PPCPredicates.h"
 #include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -211,7 +211,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
   // TRAMPOLINE is custom lowered.
-  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
@@ -365,7 +366,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
   }
 
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
     setStackPointerRegisterToSaveRestore(PPC::X1);
@@ -401,12 +406,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   if (PPCSubTarget.isDarwin())
     setPrefFunctionAlignment(4);
 
+  setInsertFencesForAtomic(true);
+
   computeRegisterProperties();
 }
 
 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
 /// function arguments in the caller parameter area.
-unsigned PPCTargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
   const TargetMachine &TM = getTargetMachine();
   // Darwin passes everything on 4 byte boundary.
   if (TM.getSubtarget<PPCSubtarget>().isDarwin())
@@ -463,7 +470,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
@@ -1368,8 +1375,13 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
   return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), false, false, 0);
 }
 
-SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
@@ -1378,7 +1390,7 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   bool isPPC64 = (PtrVT == MVT::i64);
-  const Type *IntPtrTy =
+  Type *IntPtrTy =
     DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
                                                              *DAG.getContext());
 
@@ -1398,16 +1410,13 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
 
   // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
   std::pair<SDValue, SDValue> CallResult =
-    LowerCallTo(Chain, Op.getValueType().getTypeForEVT(*DAG.getContext()),
+    LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
                 false, false, false, false, 0, CallingConv::C, false,
                 /*isReturnValueUsed=*/true,
                 DAG.getExternalSymbol("__trampoline_setup", PtrVT),
                 Args, DAG, dl);
 
-  SDValue Ops[] =
-    { CallResult.first, CallResult.second };
-
-  return DAG.getMergeValues(Ops, 2, dl);
+  return CallResult.second;
 }
 
 SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
@@ -2550,7 +2559,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
       unsigned OpFlags = 0;
       if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-          (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+          (PPCSubTarget.getTargetTriple().isMacOSX() &&
            PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
           (G->getGlobal()->isDeclaration() ||
            G->getGlobal()->isWeakForLinker())) {
@@ -2574,7 +2583,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     unsigned char OpFlags = 0;
 
     if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-        (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+        (PPCSubTarget.getTargetTriple().isMacOSX() &&
          PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
@@ -2941,6 +2950,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
   SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
   SmallVector<SDValue, 8> MemOpChains;
 
+  bool seenFloatArg = false;
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, j = 0, e = ArgLocs.size();
        i != e;
@@ -2985,6 +2995,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     }
 
     if (VA.isRegLoc()) {
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
       // Put argument in a physical register.
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
     } else {
@@ -3011,9 +3022,11 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Set CR6 to true if this is a vararg call.
+  // Set CR6 to true if this is a vararg call with floating args passed in
+  // registers.
   if (isVarArg) {
-    SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
+    SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
+                                     dl, MVT::i32), 0);
     RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
   }
 
@@ -3403,6 +3416,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                     Ins, InVals);
 }
 
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
 SDValue
 PPCTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
@@ -4490,7 +4514,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalTLSAddress:   llvm_unreachable("TLS not implemented for PPC");
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
-  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG, PPCSubTarget);
 
@@ -5504,7 +5529,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
     // but allow it at the lowest weight.
   if (CallOperandVal == NULL)
     return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
   // Look at the constraint type.
   switch (*constraint) {
   default:
@@ -5634,7 +5659,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 // isLegalAddressingMode - Return true if the addressing mode represented
 // by AM is legal for this target, for a load/store of the specified type.
 bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              const Type *Ty) const {
+                                              Type *Ty) const {
   // FIXME: PPC does not allow r+i addressing modes for vectors!
 
   // PPC allows a sign-extended 16-bit immediate field.
@@ -5670,7 +5695,7 @@ bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
 /// isLegalAddressImmediate - Return true if the integer value can be used
 /// as the offset of the target addressing mode for load / store of the
 /// given type.
-bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
   // PPC allows a sign-extended 16-bit immediate field.
   return (V > -(1 << 16) && V < (1 << 16)-1);
 }
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 986b4e73fbd3c..430e45e804934 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -246,7 +246,7 @@ namespace llvm {
     virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(EVT VT) const;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
@@ -323,7 +323,7 @@ namespace llvm {
     /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area.  This is the actual
     /// alignment, not its logarithm.
-    unsigned getByValTypeAlignment(const Type *Ty) const;
+    unsigned getByValTypeAlignment(Type *Ty) const;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops.
@@ -334,12 +334,12 @@ namespace llvm {
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
 
     /// isLegalAddressImmediate - Return true if the integer value can be used
     /// as the offset of the target addressing mode for load / store of the
     /// given type.
-    virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+    virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
 
     /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
     /// the offset of the target addressing mode.
@@ -390,7 +390,8 @@ namespace llvm {
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                          const PPCSubtarget &Subtarget) const;
     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
@@ -444,6 +445,12 @@ namespace llvm {
                 DebugLoc dl, SelectionDAG &DAG,
                 SmallVectorImpl<SDValue> &InVals) const;
 
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const;
+
     virtual SDValue
       LowerReturn(SDValue Chain,
                   CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 143444fdc22b3..2bc109c8785a1 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -15,18 +15,18 @@
 #include "PPC.h"
 #include "PPCInstrBuilder.h"
 #include "PPCMachineFunctionInfo.h"
-#include "PPCPredicates.h"
 #include "PPCTargetMachine.h"
 #include "PPCHazardRecognizers.h"
+#include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/STLExtras.h"
 
@@ -334,7 +334,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                   const TargetRegisterClass *RC,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const{
   DebugLoc DL;
-  if (RC == PPC::GPRCRegisterClass) {
+  if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
     if (SrcReg != PPC::LR) {
       NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
                                          .addReg(SrcReg,
@@ -350,7 +350,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                                  getKillRegState(isKill)),
                                          FrameIdx));
     }
-  } else if (RC == PPC::G8RCRegisterClass) {
+  } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
     if (SrcReg != PPC::LR8) {
       NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
                                          .addReg(SrcReg,
@@ -366,17 +366,17 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                                  getKillRegState(isKill)),
                                          FrameIdx));
     }
-  } else if (RC == PPC::F8RCRegisterClass) {
+  } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
     NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
                                        .addReg(SrcReg,
                                                getKillRegState(isKill)),
                                        FrameIdx));
-  } else if (RC == PPC::F4RCRegisterClass) {
+  } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
     NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
                                        .addReg(SrcReg,
                                                getKillRegState(isKill)),
                                        FrameIdx));
-  } else if (RC == PPC::CRRCRegisterClass) {
+  } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
     if ((EnablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
         (EnablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
       // FIXME (64-bit): Enable
@@ -402,7 +402,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
       // If the saved register wasn't CR0, shift the bits left so that they are
       // in CR0's slot.
       if (SrcReg != PPC::CR0) {
-        unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4;
+        unsigned ShiftBits = getPPCRegisterNumbering(SrcReg)*4;
         // rlwinm scratch, scratch, ShiftBits, 0, 31.
         NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg)
                        .addReg(ScratchReg).addImm(ShiftBits)
@@ -414,7 +414,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                                  getKillRegState(isKill)),
                                          FrameIdx));
     }
-  } else if (RC == PPC::CRBITRCRegisterClass) {
+  } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
     // FIXME: We use CRi here because there is no mtcrf on a bit. Since the
     // backend currently only uses CR1EQ as an individual bit, this should
     // not cause any bug. If we need other uses of CR bits, the following
@@ -448,7 +448,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
     return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
                                PPC::CRRCRegisterClass, NewMIs);
 
-  } else if (RC == PPC::VRRCRegisterClass) {
+  } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
     // We don't have indexed addressing for vector loads.  Emit:
     // R0 = ADDI FI#
     // STVX VAL, 0, R0
@@ -499,7 +499,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                                    unsigned DestReg, int FrameIdx,
                                    const TargetRegisterClass *RC,
                                    SmallVectorImpl<MachineInstr*> &NewMIs)const{
-  if (RC == PPC::GPRCRegisterClass) {
+  if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
     if (DestReg != PPC::LR) {
       NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
                                                  DestReg), FrameIdx));
@@ -508,7 +508,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                                                  PPC::R11), FrameIdx));
       NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11));
     }
-  } else if (RC == PPC::G8RCRegisterClass) {
+  } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
     if (DestReg != PPC::LR8) {
       NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
                                          FrameIdx));
@@ -517,13 +517,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
                                                  PPC::R11), FrameIdx));
       NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::R11));
     }
-  } else if (RC == PPC::F8RCRegisterClass) {
+  } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
     NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
                                        FrameIdx));
-  } else if (RC == PPC::F4RCRegisterClass) {
+  } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
     NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
                                        FrameIdx));
-  } else if (RC == PPC::CRRCRegisterClass) {
+  } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
     // FIXME: We need a scatch reg here.  The trouble with using R0 is that
     // it's possible for the stack frame to be so big the save location is
     // out of range of immediate offsets, necessitating another register.
@@ -537,7 +537,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
     // If the reloaded register isn't CR0, shift the bits right so that they are
     // in the right CR's slot.
     if (DestReg != PPC::CR0) {
-      unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4;
+      unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4;
       // rlwinm r11, r11, 32-ShiftBits, 0, 31.
       NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg)
                     .addReg(ScratchReg).addImm(32-ShiftBits).addImm(0)
@@ -546,7 +546,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
 
     NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTCRF), DestReg)
                      .addReg(ScratchReg));
-  } else if (RC == PPC::CRBITRCRegisterClass) {
+  } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
 
     unsigned Reg = 0;
     if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
@@ -577,7 +577,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
     return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
                                 PPC::CRRCRegisterClass, NewMIs);
 
-  } else if (RC == PPC::VRRCRegisterClass) {
+  } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
     // We don't have indexed addressing for vector loads.  Emit:
     // R0 = ADDI FI#
     // Dest = LVX 0, R0
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 773578c6ea816..f248b5ba8c48d 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1053,6 +1053,10 @@ def CRSET  : XLForm_1_ext<19, 289, (outs CRBITRC:$dst), (ins),
               "creqv $dst, $dst, $dst", BrCR,
               []>;
 
+def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins),
+              "crxor $dst, $dst, $dst", BrCR,
+              []>;
+
 // XFX-Form instructions.  Instructions that deal with SPRs.
 //
 let Uses = [CTR] in {
@@ -1472,5 +1476,7 @@ def : Pat<(membarrier (i32 imm /*ll*/),
                       (i32 imm /*device*/)),
            (SYNC)>;
 
+def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
+
 include "PPCInstrAltivec.td"
 include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/PPCMCCodeEmitter.cpp
deleted file mode 100644
index cf73d861fa4d7..0000000000000
--- a/lib/Target/PowerPC/PPCMCCodeEmitter.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PPCMCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "PPC.h"
-#include "PPCRegisterInfo.h"
-#include "PPCFixupKinds.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/ErrorHandling.h"
-using namespace llvm;
-
-STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
-
-namespace {
-class PPCMCCodeEmitter : public MCCodeEmitter {
-  PPCMCCodeEmitter(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const PPCMCCodeEmitter &);   // DO NOT IMPLEMENT
-  
-public:
-  PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                   MCContext &ctx) {
-  }
-  
-  ~PPCMCCodeEmitter() {}
-
-  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getHA16Encoding(const MCInst &MI, unsigned OpNo,
-                           SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getLO16Encoding(const MCInst &MI, unsigned OpNo,
-                           SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
-                            SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
-                               SmallVectorImpl<MCFixup> &Fixups) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups) const;
-  
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  unsigned getBinaryCodeForInstr(const MCInst &MI,
-                                 SmallVectorImpl<MCFixup> &Fixups) const;
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups) const {
-    unsigned Bits = getBinaryCodeForInstr(MI, Fixups);
-    
-    // Output the constant in big endian byte order.
-    for (unsigned i = 0; i != 4; ++i) {
-      OS << (char)(Bits >> 24);
-      Bits <<= 8;
-    }
-    
-    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
-  }
-  
-};
-  
-} // end anonymous namespace
-  
-MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
-                                            const MCSubtargetInfo &STI,
-                                            MCContext &Ctx) {
-  return new PPCMCCodeEmitter(MCII, STI, Ctx);
-}
-
-unsigned PPCMCCodeEmitter::
-getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-  
-  // Add a fixup for the branch target.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_br24));
-  return 0;
-}
-
-unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
-                                     SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-
-  // Add a fixup for the branch target.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_brcond14));
-  return 0;
-}
-
-unsigned PPCMCCodeEmitter::getHA16Encoding(const MCInst &MI, unsigned OpNo,
-                                       SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-  
-  // Add a fixup for the branch target.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_ha16));
-  return 0;
-}
-
-unsigned PPCMCCodeEmitter::getLO16Encoding(const MCInst &MI, unsigned OpNo,
-                                       SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-  
-  // Add a fixup for the branch target.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo16));
-  return 0;
-}
-
-unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
-                                            SmallVectorImpl<MCFixup> &Fixups) const {
-  // Encode (imm, reg) as a memri, which has the low 16-bits as the
-  // displacement and the next 5 bits as the register #.
-  assert(MI.getOperand(OpNo+1).isReg());
-  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 16;
-  
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isImm())
-    return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
-  
-  // Add a fixup for the displacement field.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo16));
-  return RegBits;
-}
-
-
-unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
-                                       SmallVectorImpl<MCFixup> &Fixups) const {
-  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
-  // displacement and the next 5 bits as the register #.
-  assert(MI.getOperand(OpNo+1).isReg());
-  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 14;
-  
-  const MCOperand &MO = MI.getOperand(OpNo);
-  if (MO.isImm())
-    return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
-  
-  // Add a fixup for the branch target.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo14));
-  return RegBits;
-}
-
-
-unsigned PPCMCCodeEmitter::
-get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
-                    SmallVectorImpl<MCFixup> &Fixups) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-  assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
-         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
-  return 0x80 >> PPCRegisterInfo::getRegisterNumbering(MO.getReg());
-}
-
-
-unsigned PPCMCCodeEmitter::
-getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  if (MO.isReg()) {
-    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
-    // The GPR operand should come through here though.
-    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
-           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
-    return PPCRegisterInfo::getRegisterNumbering(MO.getReg());
-  }
-  
-  assert(MO.isImm() &&
-         "Relocation required in an instruction that we cannot encode!");
-  return MO.getImm();
-}
-
-
-#include "PPCGenMCCodeEmitter.inc"
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp
deleted file mode 100644
index 12bb0a143406b..0000000000000
--- a/lib/Target/PowerPC/PPCPredicates.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PowerPC branch predicates.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPCPredicates.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-using namespace llvm;
-
-PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
-  switch (Opcode) {
-  default: llvm_unreachable("Unknown PPC branch opcode!");
-  case PPC::PRED_EQ: return PPC::PRED_NE;
-  case PPC::PRED_NE: return PPC::PRED_EQ;
-  case PPC::PRED_LT: return PPC::PRED_GE;
-  case PPC::PRED_GE: return PPC::PRED_LT;
-  case PPC::PRED_GT: return PPC::PRED_LE;
-  case PPC::PRED_LE: return PPC::PRED_GT;
-  case PPC::PRED_NU: return PPC::PRED_UN;
-  case PPC::PRED_UN: return PPC::PRED_NU;
-  }
-}
diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/PPCPredicates.h
deleted file mode 100644
index b2c831579f792..0000000000000
--- a/lib/Target/PowerPC/PPCPredicates.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the PowerPC branch predicates.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
-#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
-
-#include "PPC.h"
-
-namespace llvm {
-namespace PPC {
-  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
-  enum Predicate {
-    PRED_ALWAYS = (0 << 5) | 20,
-    PRED_LT     = (0 << 5) | 12,
-    PRED_LE     = (1 << 5) |  4,
-    PRED_EQ     = (2 << 5) | 12,
-    PRED_GE     = (0 << 5) |  4,
-    PRED_GT     = (1 << 5) | 12,
-    PRED_NE     = (2 << 5) |  4,
-    PRED_UN     = (3 << 5) | 12,
-    PRED_NU     = (3 << 5) |  4
-  };
-  
-  /// Invert the specified predicate.  != -> ==, < -> >=.
-  Predicate InvertPredicate(Predicate Opcode);
-}
-}
-
-#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9c2428b92e65b..2e90b7a4086bf 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -28,7 +28,6 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -68,52 +67,12 @@ PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const {
           (EnablePPC64RS && Subtarget.isPPC64()));
 }
 
-/// getRegisterNumbering - Given the enum value for some register, e.g.
-/// PPC::F14, return the number that it corresponds to (e.g. 14).
-unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
-  using namespace PPC;
-  switch (RegEnum) {
-  case 0: return 0;
-  case R0 :  case X0 :  case F0 :  case V0 : case CR0:  case CR0LT: return  0;
-  case R1 :  case X1 :  case F1 :  case V1 : case CR1:  case CR0GT: return  1;
-  case R2 :  case X2 :  case F2 :  case V2 : case CR2:  case CR0EQ: return  2;
-  case R3 :  case X3 :  case F3 :  case V3 : case CR3:  case CR0UN: return  3;
-  case R4 :  case X4 :  case F4 :  case V4 : case CR4:  case CR1LT: return  4;
-  case R5 :  case X5 :  case F5 :  case V5 : case CR5:  case CR1GT: return  5;
-  case R6 :  case X6 :  case F6 :  case V6 : case CR6:  case CR1EQ: return  6;
-  case R7 :  case X7 :  case F7 :  case V7 : case CR7:  case CR1UN: return  7;
-  case R8 :  case X8 :  case F8 :  case V8 : case CR2LT: return  8;
-  case R9 :  case X9 :  case F9 :  case V9 : case CR2GT: return  9;
-  case R10:  case X10:  case F10:  case V10: case CR2EQ: return 10;
-  case R11:  case X11:  case F11:  case V11: case CR2UN: return 11;
-  case R12:  case X12:  case F12:  case V12: case CR3LT: return 12;
-  case R13:  case X13:  case F13:  case V13: case CR3GT: return 13;
-  case R14:  case X14:  case F14:  case V14: case CR3EQ: return 14;
-  case R15:  case X15:  case F15:  case V15: case CR3UN: return 15;
-  case R16:  case X16:  case F16:  case V16: case CR4LT: return 16;
-  case R17:  case X17:  case F17:  case V17: case CR4GT: return 17;
-  case R18:  case X18:  case F18:  case V18: case CR4EQ: return 18;
-  case R19:  case X19:  case F19:  case V19: case CR4UN: return 19;
-  case R20:  case X20:  case F20:  case V20: case CR5LT: return 20;
-  case R21:  case X21:  case F21:  case V21: case CR5GT: return 21;
-  case R22:  case X22:  case F22:  case V22: case CR5EQ: return 22;
-  case R23:  case X23:  case F23:  case V23: case CR5UN: return 23;
-  case R24:  case X24:  case F24:  case V24: case CR6LT: return 24;
-  case R25:  case X25:  case F25:  case V25: case CR6GT: return 25;
-  case R26:  case X26:  case F26:  case V26: case CR6EQ: return 26;
-  case R27:  case X27:  case F27:  case V27: case CR6UN: return 27;
-  case R28:  case X28:  case F28:  case V28: case CR7LT: return 28;
-  case R29:  case X29:  case F29:  case V29: case CR7GT: return 29;
-  case R30:  case X30:  case F30:  case V30: case CR7EQ: return 30;
-  case R31:  case X31:  case F31:  case V31: case CR7UN: return 31;
-  default:
-    llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
-  }
-}
-
 PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
                                  const TargetInstrInfo &tii)
-  : PPCGenRegisterInfo(), Subtarget(ST), TII(tii) {
+  : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
+                       ST.isPPC64() ? 0 : 1,
+                       ST.isPPC64() ? 0 : 1),
+    Subtarget(ST), TII(tii) {
   ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
   ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
   ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
@@ -519,7 +478,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
     // rlwinm rA, rA, ShiftBits, 0, 31.
     BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg)
       .addReg(Reg, RegState::Kill)
-      .addImm(PPCRegisterInfo::getRegisterNumbering(SrcReg) * 4)
+      .addImm(getPPCRegisterNumbering(SrcReg) * 4)
       .addImm(0)
       .addImm(31);
 
@@ -668,10 +627,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false);
 }
 
-unsigned PPCRegisterInfo::getRARegister() const {
-  return !Subtarget.isPPC64() ? PPC::LR : PPC::LR8;
-}
-
 unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
@@ -688,27 +643,3 @@ unsigned PPCRegisterInfo::getEHExceptionRegister() const {
 unsigned PPCRegisterInfo::getEHHandlerRegister() const {
   return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
 }
-
-/// DWARFFlavour - Flavour of dwarf regnumbers
-///
-namespace DWARFFlavour {
-  enum {
-    PPC64 = 0, PPC32 = 1
-  };
-}
-
-int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
-  unsigned Flavour = Subtarget.isPPC64() ?
-    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
-
-  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, Flavour);
-}
-
-int PPCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
-  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
-  unsigned Flavour = Subtarget.isPPC64() ?
-    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
-
-  return PPCGenRegisterInfo::getLLVMRegNumFull(RegNum, Flavour);
-}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 33fe5ebcf4cd9..1cc7213417d1b 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -33,10 +33,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
 public:
   PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
   
-  /// getRegisterNumbering - Given the enum value for some register, e.g.
-  /// PPC::F14, return the number that it corresponds to (e.g. 14).
-  static unsigned getRegisterNumbering(unsigned RegEnum);
-
   /// getPointerRegClass - Return the register class to use to hold pointers.
   /// This is used for addressing modes.
   virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const;  
@@ -62,15 +58,11 @@ public:
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
-
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 5ea9b0f6596c4..cf194de42e8f3 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -15,7 +15,7 @@
 #include "PPC.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 #include <cstdlib>
 
 #define GET_SUBTARGETINFO_TARGET_DESC
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index e0ea5adba751e..f5744b830489f 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -16,76 +16,43 @@
 #include "llvm/PassManager.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-// This is duplicated code. Refactor this.
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
-                                    MCContext &Ctx, TargetAsmBackend &TAB,
-                                    raw_ostream &OS,
-                                    MCCodeEmitter *Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  if (Triple(TT).isOSDarwin())
-    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-
-  return NULL;
-}
-
 extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);  
   RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
-  
-  // Register the MC Code Emitter
-  TargetRegistry::RegisterCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
-  TargetRegistry::RegisterCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
-  
-  
-  // Register the asm backend.
-  TargetRegistry::RegisterAsmBackend(ThePPC32Target, createPPCAsmBackend);
-  TargetRegistry::RegisterAsmBackend(ThePPC64Target, createPPCAsmBackend);
-  
-  // Register the object streamer.
-  TargetRegistry::RegisterObjectStreamer(ThePPC32Target, createMCStreamer);
-  TargetRegistry::RegisterObjectStreamer(ThePPC64Target, createMCStreamer);
 }
 
-
-PPCTargetMachine::PPCTargetMachine(const Target &T, const std::string &TT,
-                                   const std::string &CPU,
-                                   const std::string &FS, bool is64Bit)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   bool is64Bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS, is64Bit),
     DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
     FrameLowering(Subtarget), JITInfo(*this, is64Bit),
     TLInfo(*this), TSInfo(*this),
     InstrItins(Subtarget.getInstrItineraryData()) {
-
-  if (getRelocationModel() == Reloc::Default) {
-    if (Subtarget.isDarwin())
-      setRelocationModel(Reloc::DynamicNoPIC);
-    else
-      setRelocationModel(Reloc::Static);
-  }
 }
 
 /// Override this for PowerPC.  Tail merging happily breaks up instruction issue
 /// groups, which typically degrades performance.
 bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
 
-PPC32TargetMachine::PPC32TargetMachine(const Target &T, const std::string &TT, 
-                                       const std::string &CPU,
-                                       const std::string &FS) 
-  : PPCTargetMachine(T, TT, CPU, FS, false) {
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT, 
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM) 
+  : PPCTargetMachine(T, TT, CPU, FS, RM, CM, false) {
 }
 
 
-PPC64TargetMachine::PPC64TargetMachine(const Target &T, const std::string &TT, 
-                                       const std::string &CPU, 
-                                       const std::string &FS)
-  : PPCTargetMachine(T, TT, CPU, FS, true) {
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, 
+                                       StringRef CPU,  StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : PPCTargetMachine(T, TT, CPU, FS, RM, CM, true) {
 }
 
 
@@ -110,19 +77,11 @@ bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM,
 bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
                                       CodeGenOpt::Level OptLevel,
                                       JITCodeEmitter &JCE) {
-  // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
   // FIXME: This should be moved to TargetJITInfo!!
-  if (Subtarget.isPPC64()) {
-    // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
-    // instructions to materialize arbitrary global variable + function +
-    // constant pool addresses.
-    setRelocationModel(Reloc::PIC_);
+  if (Subtarget.isPPC64())
     // Temporary workaround for the inability of PPC64 JIT to handle jump
     // tables.
     DisableJumpTables = true;      
-  } else {
-    setRelocationModel(Reloc::Static);
-  }
   
   // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
   // writing?
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index baf07e3498f81..d06f0843bd6db 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -40,9 +40,9 @@ class PPCTargetMachine : public LLVMTargetMachine {
   InstrItineraryData  InstrItins;
 
 public:
-  PPCTargetMachine(const Target &T, const std::string &TT,
-                   const std::string &CPU, const std::string &FS,
-                   bool is64Bit);
+  PPCTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS,
+                   Reloc::Model RM, CodeModel::Model CM, bool is64Bit);
 
   virtual const PPCInstrInfo      *getInstrInfo() const { return &InstrInfo; }
   virtual const PPCFrameLowering  *getFrameLowering() const {
@@ -77,16 +77,18 @@ public:
 ///
 class PPC32TargetMachine : public PPCTargetMachine {
 public:
-  PPC32TargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS);
+  PPC32TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 };
 
 /// PPC64TargetMachine - PowerPC 64-bit target machine.
 ///
 class PPC64TargetMachine : public PPCTargetMachine {
 public:
-  PPC64TargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS);
+  PPC64TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
index 058d599a4af06..f63111f465c33 100644
--- a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
+++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMPowerPCInfo
   PowerPCTargetInfo.cpp
   )
 
-add_dependencies(LLVMPowerPCInfo PowerPCCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMPowerPCInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMPowerPCInfo PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index ad607d0ade6ab..5dc8568d83f2f 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "PPC.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::ThePPC32Target, llvm::ThePPC64Target;
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 4cc95340890d4..1f69ffb09c0a4 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2348,3 +2348,13 @@ which can do this in a single operation (instruction or libcall).  It is
 probably best to do this in the code generator.
 
 //===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index c77ded4b435ec..5b87849b9d17b 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -1,11 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS Sparc.td)
 
-tablegen(SparcGenRegisterInfo.inc -gen-register-info)
-tablegen(SparcGenInstrInfo.inc -gen-instr-info)
-tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
-tablegen(SparcGenDAGISel.inc -gen-dag-isel)
-tablegen(SparcGenSubtargetInfo.inc -gen-subtarget)
-tablegen(SparcGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SparcGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SparcGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SparcGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SparcGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SparcGenCallingConv.inc -gen-callingconv)
+add_public_tablegen_target(SparcCommonTableGen)
 
 add_llvm_target(SparcCodeGen
   DelaySlotFiller.cpp
@@ -21,5 +22,17 @@ add_llvm_target(SparcCodeGen
   SparcSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMSparcCodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSparcDesc
+  LLVMSparcInfo
+  LLVMSupport
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
index 1e8c029798871..d3bdf0b503aeb 100644
--- a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,11 @@ add_llvm_library(LLVMSparcDesc
   SparcMCTargetDesc.cpp
   SparcMCAsmInfo.cpp
   )
+
+add_llvm_library_dependencies(LLVMSparcDesc
+  LLVMMC
+  LLVMSparcInfo
+  LLVMSupport
+  )
+
+add_dependencies(LLVMSparcDesc SparcCommonTableGen)
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index cb92a2bfd417a..cb2a7dfe61601 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -13,10 +13,11 @@
 
 #include "SparcMCTargetDesc.h"
 #include "SparcMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "SparcGenInstrInfo.inc"
@@ -35,8 +36,10 @@ static MCInstrInfo *createSparcMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeSparcMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSparcMCRegisterInfo(X, SP::I7);
+  return X;
 }
 
 static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,12 +49,31 @@ static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeSparcMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
-                                          createSparcMCSubtargetInfo);
+static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeSparcMCAsmInfo() {
+extern "C" void LLVMInitializeSparcTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfo<SparcELFMCAsmInfo> X(TheSparcTarget);
   RegisterMCAsmInfo<SparcELFMCAsmInfo> Y(TheSparcV9Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
+                                       createSparcMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target,
+                                       createSparcMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
+                                          createSparcMCSubtargetInfo);
 }
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index edde8427aa896..345e1bca54c64 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -22,9 +22,9 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 6f30d3fd6c35e..d70b16375e953 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -631,8 +631,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
   assert(CalleeFn->hasStructRetAttr() &&
          "Callee does not have the StructRet attribute.");
 
-  const PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
-  const Type *ElementTy = Ty->getElementType();
+  PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
+  Type *ElementTy = Ty->getElementType();
   return getTargetData()->getTypeAllocSize(ElementTy);
 }
 
@@ -748,8 +748,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 
-  // SPARC has no intrinsics for these particular operations.
+  // FIXME: There are instructions available for ATOMIC_FENCE
+  // on SparcV8 and later.
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
 
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 4e3ddf839985b..7a6bf50fa7d4a 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -17,8 +17,8 @@
 #include "SparcSubtarget.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 0acdd2c55d6ba..8c1625148c8cc 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -17,7 +17,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Type.h"
@@ -31,7 +30,7 @@ using namespace llvm;
 
 SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
                                      const TargetInstrInfo &tii)
-  : SparcGenRegisterInfo(), Subtarget(st), TII(tii) {
+  : SparcGenRegisterInfo(SP::I7), Subtarget(st), TII(tii) {
 }
 
 const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
@@ -113,10 +112,6 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 void SparcRegisterInfo::
 processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
 
-unsigned SparcRegisterInfo::getRARegister() const {
-  return SP::I7;
-}
-
 unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return SP::I6;
 }
@@ -130,11 +125,3 @@ unsigned SparcRegisterInfo::getEHHandlerRegister() const {
   llvm_unreachable("What is the exception handler register");
   return 0;
 }
-
-int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int SparcRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  return SparcGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index ec9e63a686bcf..f845667b4d9cb 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -46,15 +46,11 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
   void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
-
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index de647e8221a22..6c501cff6a3ab 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -13,7 +13,7 @@
 
 #include "SparcSubtarget.h"
 #include "Sparc.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index cbe6d8754efd2..3d7b4a47d1a82 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -13,7 +13,7 @@
 #include "Sparc.h"
 #include "SparcTargetMachine.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 extern "C" void LLVMInitializeSparcTarget() {
@@ -24,10 +24,11 @@ extern "C" void LLVMInitializeSparcTarget() {
 
 /// SparcTargetMachine ctor - Create an ILP32 architecture model
 ///
-SparcTargetMachine::SparcTargetMachine(const Target &T, const std::string &TT, 
-                                       const std::string &CPU,
-                                       const std::string &FS, bool is64bit)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, 
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       bool is64bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS, is64bit),
     DataLayout(Subtarget.getDataLayout()),
     TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget),
@@ -51,15 +52,15 @@ bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM,
 }
 
 SparcV8TargetMachine::SparcV8TargetMachine(const Target &T,
-                                           const std::string &TT, 
-                                           const std::string &CPU,
-                                           const std::string &FS)
-  : SparcTargetMachine(T, TT, CPU, FS, false) {
+                                           StringRef TT, StringRef CPU,
+                                           StringRef FS, Reloc::Model RM,
+                                           CodeModel::Model CM)
+  : SparcTargetMachine(T, TT, CPU, FS, RM, CM, false) {
 }
 
 SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, 
-                                           const std::string &TT, 
-                                           const std::string &CPU,
-                                           const std::string &FS)
-  : SparcTargetMachine(T, TT, CPU, FS, true) {
+                                           StringRef TT,  StringRef CPU,
+                                           StringRef FS, Reloc::Model RM,
+                                           CodeModel::Model CM)
+  : SparcTargetMachine(T, TT, CPU, FS, RM, CM, true) {
 }
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 799fc497f4aea..3c907dd44de1d 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -33,9 +33,9 @@ class SparcTargetMachine : public LLVMTargetMachine {
   SparcInstrInfo InstrInfo;
   SparcFrameLowering FrameLowering;
 public:
-  SparcTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS,
-                     bool is64bit);
+  SparcTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM, bool is64bit);
 
   virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
   virtual const TargetFrameLowering  *getFrameLowering() const {
@@ -62,16 +62,18 @@ public:
 ///
 class SparcV8TargetMachine : public SparcTargetMachine {
 public:
-  SparcV8TargetMachine(const Target &T, const std::string &TT,
-                       const std::string &CPU, const std::string &FS);
+  SparcV8TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       Reloc::Model RM, CodeModel::Model CM);
 };
 
 /// SparcV9TargetMachine - Sparc 64-bit target machine
 ///
 class SparcV9TargetMachine : public SparcTargetMachine {
 public:
-  SparcV9TargetMachine(const Target &T, const std::string &TT,
-                       const std::string &CPU, const std::string &FS);
+  SparcV9TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       Reloc::Model RM, CodeModel::Model CM);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
index 870b56a6ea1be..a0760231386a5 100644
--- a/lib/Target/Sparc/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMSparcInfo
   SparcTargetInfo.cpp
   )
 
-add_dependencies(LLVMSparcInfo SparcCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMSparcInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMSparcInfo SparcCommonTableGen)
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 5c06f0727e96e..c9d5b7bdfb3de 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "Sparc.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheSparcTarget;
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index f4bdbd8cd1731..7c09c0ea7b5a6 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -1,11 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS SystemZ.td)
 
-tablegen(SystemZGenRegisterInfo.inc -gen-register-info)
-tablegen(SystemZGenInstrInfo.inc -gen-instr-info)
-tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
-tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
-tablegen(SystemZGenCallingConv.inc -gen-callingconv)
-tablegen(SystemZGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SystemZGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SystemZGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SystemZGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SystemZGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(SystemZCommonTableGen)
 
 add_llvm_target(SystemZCodeGen
   SystemZAsmPrinter.cpp
@@ -19,5 +20,17 @@ add_llvm_target(SystemZCodeGen
   SystemZSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMSystemZCodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMSystemZDesc
+  LLVMSystemZInfo
+  LLVMTarget
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
index 2ac90164721ff..822df097a37d6 100644
--- a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -3,5 +3,12 @@ add_llvm_library(LLVMSystemZDesc
   SystemZMCAsmInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMSystemZDesc
+  LLVMMC
+  LLVMSystemZInfo
+  )
+
+add_dependencies(LLVMSystemZDesc SystemZCommonTableGen)
+
 # Hack: we need to include 'main' target directory to grab private headers
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 5a826a6ef8878..23fb1e068e704 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -13,10 +13,11 @@
 
 #include "SystemZMCTargetDesc.h"
 #include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "SystemZGenInstrInfo.inc"
@@ -35,9 +36,10 @@ static MCInstrInfo *createSystemZMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeSystemZMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
-                                      createSystemZMCInstrInfo);
+static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSystemZMCRegisterInfo(X, 0);
+  return X;
 }
 
 static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
@@ -48,11 +50,32 @@ static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
   return X;
 }
 
-extern "C" void LLVMInitializeSystemZMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
-                                          createSystemZMCSubtargetInfo);
+static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                 CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default)
+    RM = Reloc::Static;
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeSystemZMCAsmInfo() {
+extern "C" void LLVMInitializeSystemZTargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfo<SystemZMCAsmInfo> X(TheSystemZTarget);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheSystemZTarget,
+                                        createSystemZMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
+                                      createSystemZMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheSystemZTarget,
+                                    createSystemZMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
+                                          createSystemZMCSubtargetInfo);
 }
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index fd4d8b70c75e0..43dcdfc3936b8 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -28,10 +28,8 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 871c2972a8c48..48ca99ff9ea27 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -81,6 +81,7 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
   setSchedulingPreference(Sched::RegPressure);
 
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
   setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 99e2730609e8f..5f3dd80f2d90a 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -21,8 +21,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_CTOR
 #include "SystemZGenInstrInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 11a39fcd023a8..580d65b27f8a2 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -478,7 +478,7 @@ def MOV64rmm  : RSYI<0x04EB,
                      "lmg\t{$from, $to, $dst}",
                      []>;
 
-let isReMaterializable = 1, isAsCheapAsAMove = 1,
+let isReMaterializable = 1, neverHasSideEffects = 1, isAsCheapAsAMove = 1,
     Constraints = "$src = $dst" in {
 def MOV64Pr0_even : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
                            "lhi\t${dst:subreg_even}, 0",
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 59692e8833660..b1050d46e5509 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
 
 SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm,
                                          const SystemZInstrInfo &tii)
-  : SystemZGenRegisterInfo(), TM(tm), TII(tii) {
+  : SystemZGenRegisterInfo(0), TM(tm), TII(tii) {
 }
 
 const unsigned*
@@ -126,11 +126,6 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MI.getOperand(i+1).ChangeToImmediate(Offset);
 }
 
-unsigned SystemZRegisterInfo::getRARegister() const {
-  assert(0 && "What is the return address register");
-  return 0;
-}
-
 unsigned
 SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   assert(0 && "What is the frame register");
@@ -146,13 +141,3 @@ unsigned SystemZRegisterInfo::getEHHandlerRegister() const {
   assert(0 && "What is the exception handler register");
   return 0;
 }
-
-int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  assert(0 && "What is the dwarf register number");
-  return -1;
-}
-
-int SystemZRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  assert(0 && "What is the dwarf register number");
-  return -1;
-}
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 2e262e1acc304..03935b2bec10b 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -48,15 +48,11 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
   unsigned getEHHandlerRegister() const;
-
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index b3ed06639758c..0845510761c2f 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -15,7 +15,7 @@
 #include "SystemZ.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 48298cc744e72..e390f060c9a91 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,7 +10,7 @@
 #include "SystemZTargetMachine.h"
 #include "SystemZ.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 extern "C" void LLVMInitializeSystemZTarget() {
@@ -21,18 +21,15 @@ extern "C" void LLVMInitializeSystemZTarget() {
 /// SystemZTargetMachine ctor - Create an ILP64 architecture model
 ///
 SystemZTargetMachine::SystemZTargetMachine(const Target &T,
-                                           const std::string &TT,
-                                           const std::string &CPU,
-                                           const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+                                           StringRef TT, StringRef CPU,
+                                           StringRef FS, Reloc::Model RM,
+                                           CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS),
     DataLayout("E-p:64:64:64-i8:8:16-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
                "-f64:64:64-f128:128:128-a0:16:16-n32:64"),
     InstrInfo(*this), TLInfo(*this), TSInfo(*this),
     FrameLowering(Subtarget) {
-
-  if (getRelocationModel() == Reloc::Default)
-    setRelocationModel(Reloc::Static);
 }
 
 bool SystemZTargetMachine::addInstSelector(PassManagerBase &PM,
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index e40b556c0c3c9..43dce4bd148e3 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -37,8 +37,9 @@ class SystemZTargetMachine : public LLVMTargetMachine {
   SystemZSelectionDAGInfo TSInfo;
   SystemZFrameLowering    FrameLowering;
 public:
-  SystemZTargetMachine(const Target &T, const std::string &TT,
-                       const std::string &CPU, const std::string &FS);
+  SystemZTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       Reloc::Model RM, CodeModel::Model CM);
 
   virtual const TargetFrameLowering *getFrameLowering() const {
     return &FrameLowering;
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
index 743d8d322d054..31807081bd6ed 100644
--- a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMSystemZInfo
   SystemZTargetInfo.cpp
   )
 
-add_dependencies(LLVMSystemZInfo SystemZCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMSystemZInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMSystemZInfo SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index 8272b1188201c..da99282ecb042 100644
--- a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "SystemZ.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheSystemZTarget;
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index a42ce548c8955..a2b83bcce46e5 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -17,6 +17,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/PassManager.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/LLVMContext.h"
 #include <cstring>
 
@@ -39,6 +40,11 @@ void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
   unwrap(PM)->add(new TargetData(*unwrap(TD)));
 }
 
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new TargetLibraryInfo(*unwrap(TLI)));
+}
+
 char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
   std::string StringRep = unwrap(TD)->getStringRepresentation();
   return strdup(StringRep.c_str());
@@ -87,13 +93,13 @@ unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
 
 unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
                              unsigned long long Offset) {
-  const StructType *STy = unwrap<StructType>(StructTy);
+  StructType *STy = unwrap<StructType>(StructTy);
   return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
 }
 
 unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
                                        unsigned Element) {
-  const StructType *STy = unwrap<StructType>(StructTy);
+  StructType *STy = unwrap<StructType>(StructTy);
   return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
 }
 
diff --git a/lib/Target/TargetAsmInfo.cpp b/lib/Target/TargetAsmInfo.cpp
deleted file mode 100644
index a97b0e8689894..0000000000000
--- a/lib/Target/TargetAsmInfo.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- llvm/Target/TargetAsmInfo.cpp - Target Assembly Info --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-using namespace llvm;
-
-TargetAsmInfo::TargetAsmInfo(const TargetMachine &TM) {
-  TLOF = &TM.getTargetLowering()->getObjFileLowering();
-  TFI = TM.getFrameLowering();
-  TRI = TM.getRegisterInfo();
-  TFI->getInitialFrameState(InitialFrameState);
-}
diff --git a/lib/Target/TargetAsmLexer.cpp b/lib/Target/TargetAsmLexer.cpp
deleted file mode 100644
index d4893ff2521eb..0000000000000
--- a/lib/Target/TargetAsmLexer.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- llvm/Target/TargetAsmLexer.cpp - Target Assembly Lexer ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmLexer.h"
-using namespace llvm;
-
-TargetAsmLexer::TargetAsmLexer(const Target &T) : TheTarget(T), Lexer(NULL) {}
-TargetAsmLexer::~TargetAsmLexer() {}
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index 17d022a339e69..bd6a6b67beb97 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -41,7 +41,7 @@ char TargetData::ID = 0;
 // Support for StructLayout
 //===----------------------------------------------------------------------===//
 
-StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
+StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
   assert(!ST->isOpaque() && "Cannot get layout of opaque structs");
   StructAlignment = 0;
   StructSize = 0;
@@ -49,7 +49,7 @@ StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
 
   // Loop over each of the elements, placing them in memory.
   for (unsigned i = 0, e = NumElements; i != e; ++i) {
-    const Type *Ty = ST->getElementType(i);
+    Type *Ty = ST->getElementType(i);
     unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
 
     // Add padding if necessary to align the data element properly.
@@ -139,6 +139,7 @@ void TargetData::init(StringRef Desc) {
   PointerMemSize = 8;
   PointerABIAlign = 8;
   PointerPrefAlign = PointerABIAlign;
+  StackNaturalAlign = 0;
 
   // Default alignments
   setAlignment(INTEGER_ALIGN,   1,  1, 1);   // i1
@@ -218,7 +219,12 @@ void TargetData::init(StringRef Desc) {
         Token = Split.second;
       } while (!Specifier.empty() || !Token.empty());
       break;
-
+    case 'S': // Stack natural alignment.
+      StackNaturalAlign = getInt(Specifier.substr(1));
+      StackNaturalAlign /= 8;
+      // FIXME: Should we really be truncating these alingments and
+      // sizes silently?
+      break;
     default:
       break;
     }
@@ -261,7 +267,7 @@ TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
 /// preferred if ABIInfo = false) the target wants for the specified datatype.
 unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
                                       uint32_t BitWidth, bool ABIInfo,
-                                      const Type *Ty) const {
+                                      Type *Ty) const {
   // Check to see if we have an exact match and remember the best match we see.
   int BestMatchIdx = -1;
   int LargestInt = -1;
@@ -315,7 +321,7 @@ unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
 namespace {
 
 class StructLayoutMap {
-  typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
+  typedef DenseMap<StructType*, StructLayout*> LayoutInfoTy;
   LayoutInfoTy LayoutInfo;
 
 public:
@@ -329,7 +335,7 @@ public:
     }
   }
 
-  StructLayout *&operator[](const StructType *STy) {
+  StructLayout *&operator[](StructType *STy) {
     return LayoutInfo[STy];
   }
 
@@ -343,7 +349,7 @@ TargetData::~TargetData() {
   delete static_cast<StructLayoutMap*>(LayoutMap);
 }
 
-const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
+const StructLayout *TargetData::getStructLayout(StructType *Ty) const {
   if (!LayoutMap)
     LayoutMap = new StructLayoutMap();
 
@@ -372,7 +378,9 @@ std::string TargetData::getStringRepresentation() const {
 
   OS << (LittleEndian ? "e" : "E")
      << "-p:" << PointerMemSize*8 << ':' << PointerABIAlign*8
-     << ':' << PointerPrefAlign*8;
+     << ':' << PointerPrefAlign*8
+     << "-S" << StackNaturalAlign*8;
+
   for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
     const TargetAlignElem &AI = Alignments[i];
     OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
@@ -389,14 +397,14 @@ std::string TargetData::getStringRepresentation() const {
 }
 
 
-uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
+uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
   assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
   switch (Ty->getTypeID()) {
   case Type::LabelTyID:
   case Type::PointerTyID:
     return getPointerSizeInBits();
   case Type::ArrayTyID: {
-    const ArrayType *ATy = cast<ArrayType>(Ty);
+    ArrayType *ATy = cast<ArrayType>(Ty);
     return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
   }
   case Type::StructTyID:
@@ -435,7 +443,7 @@ uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
   Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
   == false) for the requested type \a Ty.
  */
-unsigned TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
+unsigned TargetData::getAlignment(Type *Ty, bool abi_or_pref) const {
   int AlignType = -1;
 
   assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
@@ -485,7 +493,7 @@ unsigned TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
                           abi_or_pref, Ty);
 }
 
-unsigned TargetData::getABITypeAlignment(const Type *Ty) const {
+unsigned TargetData::getABITypeAlignment(Type *Ty) const {
   return getAlignment(Ty, true);
 }
 
@@ -496,7 +504,7 @@ unsigned TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const {
 }
 
 
-unsigned TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
+unsigned TargetData::getCallFrameTypeAlignment(Type *Ty) const {
   for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
     if (Alignments[i].AlignType == STACK_ALIGN)
       return Alignments[i].ABIAlign;
@@ -504,11 +512,11 @@ unsigned TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
   return getABITypeAlignment(Ty);
 }
 
-unsigned TargetData::getPrefTypeAlignment(const Type *Ty) const {
+unsigned TargetData::getPrefTypeAlignment(Type *Ty) const {
   return getAlignment(Ty, false);
 }
 
-unsigned TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
+unsigned TargetData::getPreferredTypeAlignmentShift(Type *Ty) const {
   unsigned Align = getPrefTypeAlignment(Ty);
   assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
   return Log2_32(Align);
@@ -521,16 +529,17 @@ IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
 }
 
 
-uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
-                                      unsigned NumIndices) const {
-  const Type *Ty = ptrTy;
+uint64_t TargetData::getIndexedOffset(Type *ptrTy,
+                                      ArrayRef<Value *> Indices) const {
+  Type *Ty = ptrTy;
   assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
   uint64_t Result = 0;
 
   generic_gep_type_iterator<Value* const*>
-    TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
-  for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
-    if (const StructType *STy = dyn_cast<StructType>(*TI)) {
+    TI = gep_type_begin(ptrTy, Indices);
+  for (unsigned CurIDX = 0, EndIDX = Indices.size(); CurIDX != EndIDX;
+       ++CurIDX, ++TI) {
+    if (StructType *STy = dyn_cast<StructType>(*TI)) {
       assert(Indices[CurIDX]->getType() ==
              Type::getInt32Ty(ptrTy->getContext()) &&
              "Illegal struct idx");
@@ -561,7 +570,7 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
 /// global.  This includes an explicitly requested alignment (if the global
 /// has one).
 unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
-  const Type *ElemType = GV->getType()->getElementType();
+  Type *ElemType = GV->getType()->getElementType();
   unsigned Alignment = getPrefTypeAlignment(ElemType);
   unsigned GVAlignment = GV->getAlignment();
   if (GVAlignment >= Alignment) {
diff --git a/lib/Target/TargetFrameLowering.cpp b/lib/Target/TargetFrameLowering.cpp
index 19fd581c7dd5c..122f8696e2ce6 100644
--- a/lib/Target/TargetFrameLowering.cpp
+++ b/lib/Target/TargetFrameLowering.cpp
@@ -23,14 +23,6 @@ using namespace llvm;
 TargetFrameLowering::~TargetFrameLowering() {
 }
 
-/// getInitialFrameState - Returns a list of machine moves that are assumed
-/// on entry to a function.
-void
-TargetFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
-                                                                         const {
-  // Default is to do nothing.
-}
-
 /// getFrameIndexOffset - Returns the displacement from the frame register to
 /// the stack frame of the specified index. This is the default implementation
 /// which is overridden for some targets.
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 703431b3806e2..56b7b69de0bdc 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -35,34 +35,16 @@ using namespace llvm;
 //                              Generic Code
 //===----------------------------------------------------------------------===//
 
-TargetLoweringObjectFile::TargetLoweringObjectFile() :
-  Ctx(0),
-  TextSection(0),
-  DataSection(0),
-  BSSSection(0),
-  ReadOnlySection(0),
-  StaticCtorSection(0),
-  StaticDtorSection(0),
-  LSDASection(0),
-  CompactUnwindSection(0),
-  DwarfAbbrevSection(0),
-  DwarfInfoSection(0),
-  DwarfLineSection(0),
-  DwarfFrameSection(0),
-  DwarfPubNamesSection(0),
-  DwarfPubTypesSection(0),
-  DwarfDebugInlineSection(0),
-  DwarfStrSection(0),
-  DwarfLocSection(0),
-  DwarfARangesSection(0),
-  DwarfRangesSection(0),
-  DwarfMacroInfoSection(0),
-  TLSExtraDataSection(0),
-  CommDirectiveSupportsAlignment(true),
-  SupportsWeakOmittedEHFrame(true), 
-  IsFunctionEHFrameSymbolPrivate(true) {
+/// Initialize - this method must be called before any actual lowering is
+/// done.  This specifies the current context for codegen, and gives the
+/// lowering implementations a chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+                                          const TargetMachine &TM) {
+  Ctx = &ctx;
+  InitMCObjectFileInfo(TM.getTargetTriple(),
+                       TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
 }
-
+  
 TargetLoweringObjectFile::~TargetLoweringObjectFile() {
 }
 
@@ -93,7 +75,7 @@ static bool isSuitableForBSS(const GlobalVariable *GV) {
 /// known to have a type that is an array of 1/2/4 byte elements) ends with a
 /// nul value and contains no other nuls in it.
 static bool IsNullTerminatedString(const Constant *C) {
-  const ArrayType *ATy = cast<ArrayType>(C->getType());
+  ArrayType *ATy = cast<ArrayType>(C->getType());
 
   // First check: is we have constant array of i8 terminated with zero
   if (const ConstantArray *CVA = dyn_cast<ConstantArray>(C)) {
@@ -188,8 +170,8 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
         
       // If initializer is a null-terminated string, put it in a "cstring"
       // section of the right width.
-      if (const ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
-        if (const IntegerType *ITy =
+      if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+        if (IntegerType *ITy =
               dyn_cast<IntegerType>(ATy->getElementType())) {
           if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
                ITy->getBitWidth() == 32) &&
@@ -341,20 +323,3 @@ getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding,
   }
   }
 }
-
-unsigned TargetLoweringObjectFile::getPersonalityEncoding() const {
-  return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getLSDAEncoding() const {
-  return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const {
-  return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getTTypeEncoding() const {
-  return dwarf::DW_EH_PE_absptr;
-}
-
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 74a1f4e8da56b..fe8a7cebd0a01 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -40,8 +40,6 @@ namespace llvm {
   bool JITExceptionHandling;
   bool JITEmitDebugInfo;
   bool JITEmitDebugInfoToDisk;
-  Reloc::Model RelocationModel;
-  CodeModel::Model CMModel;
   bool GuaranteedTailCallOpt;
   unsigned StackAlignmentOverride;
   bool RealignStack;
@@ -49,6 +47,7 @@ namespace llvm {
   bool StrongPHIElim;
   bool HasDivModLibcall;
   bool AsmVerbosityDefault(false);
+  bool EnableSegmentedStacks;
 }
 
 static cl::opt<bool, true>
@@ -143,38 +142,6 @@ EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
   cl::location(JITEmitDebugInfoToDisk),
   cl::init(false));
 
-static cl::opt<llvm::Reloc::Model, true>
-DefRelocationModel("relocation-model",
-  cl::desc("Choose relocation model"),
-  cl::location(RelocationModel),
-  cl::init(Reloc::Default),
-  cl::values(
-    clEnumValN(Reloc::Default, "default",
-               "Target default relocation model"),
-    clEnumValN(Reloc::Static, "static",
-               "Non-relocatable code"),
-    clEnumValN(Reloc::PIC_, "pic",
-               "Fully relocatable, position independent code"),
-    clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
-               "Relocatable external references, non-relocatable code"),
-    clEnumValEnd));
-static cl::opt<llvm::CodeModel::Model, true>
-DefCodeModel("code-model",
-  cl::desc("Choose code model"),
-  cl::location(CMModel),
-  cl::init(CodeModel::Default),
-  cl::values(
-    clEnumValN(CodeModel::Default, "default",
-               "Target default code model"),
-    clEnumValN(CodeModel::Small, "small",
-               "Small code model"),
-    clEnumValN(CodeModel::Kernel, "kernel",
-               "Kernel code model"),
-    clEnumValN(CodeModel::Medium, "medium",
-               "Medium code model"),
-    clEnumValN(CodeModel::Large, "large",
-               "Large code model"),
-    clEnumValEnd));
 static cl::opt<bool, true>
 EnableGuaranteedTailCallOpt("tailcallopt",
   cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
@@ -212,13 +179,20 @@ static cl::opt<bool>
 FunctionSections("ffunction-sections",
   cl::desc("Emit functions into separate sections"),
   cl::init(false));
+static cl::opt<bool, true>
+SegmentedStacks("segmented-stacks",
+  cl::desc("Use segmented stacks if possible."),
+  cl::location(EnableSegmentedStacks),
+  cl::init(false));
+                         
 //---------------------------------------------------------------------------
 // TargetMachine Class
 //
 
 TargetMachine::TargetMachine(const Target &T,
                              StringRef TT, StringRef CPU, StringRef FS)
-  : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(0),
+  : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS),
+    CodeGenInfo(0), AsmInfo(0),
     MCRelaxAll(false),
     MCNoExecStack(false),
     MCSaveTempLabels(false),
@@ -231,29 +205,24 @@ TargetMachine::TargetMachine(const Target &T,
 }
 
 TargetMachine::~TargetMachine() {
+  delete CodeGenInfo;
   delete AsmInfo;
 }
 
 /// getRelocationModel - Returns the code generation relocation model. The
 /// choices are static, PIC, and dynamic-no-pic, and target default.
-Reloc::Model TargetMachine::getRelocationModel() {
-  return RelocationModel;
-}
-
-/// setRelocationModel - Sets the code generation relocation model.
-void TargetMachine::setRelocationModel(Reloc::Model Model) {
-  RelocationModel = Model;
+Reloc::Model TargetMachine::getRelocationModel() const {
+  if (!CodeGenInfo)
+    return Reloc::Default;
+  return CodeGenInfo->getRelocationModel();
 }
 
 /// getCodeModel - Returns the code model. The choices are small, kernel,
 /// medium, large, and target default.
-CodeModel::Model TargetMachine::getCodeModel() {
-  return CMModel;
-}
-
-/// setCodeModel - Sets the code model.
-void TargetMachine::setCodeModel(CodeModel::Model Model) {
-  CMModel = Model;
+CodeModel::Model TargetMachine::getCodeModel() const {
+  if (!CodeGenInfo)
+    return CodeModel::Default;
+  return CodeGenInfo->getCodeModel();
 }
 
 bool TargetMachine::getAsmVerbosityDefault() {
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 90a8f8d8fdcce..67239b830eb55 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -98,44 +98,25 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
 }
 
 const TargetRegisterClass *
-llvm::getCommonSubClass(const TargetRegisterClass *A,
-                        const TargetRegisterClass *B) {
-  // First take care of the trivial cases
+TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
+                                      const TargetRegisterClass *B) const {
+  // First take care of the trivial cases.
   if (A == B)
     return A;
   if (!A || !B)
     return 0;
 
-  // If B is a subclass of A, it will be handled in the loop below
-  if (B->hasSubClass(A))
-    return A;
+  // Register classes are ordered topologically, so the largest common
+  // sub-class it the common sub-class with the smallest ID.
+  const unsigned *SubA = A->getSubClassMask();
+  const unsigned *SubB = B->getSubClassMask();
 
-  const TargetRegisterClass *Best = 0;
-  for (TargetRegisterClass::sc_iterator I = A->subclasses_begin();
-       const TargetRegisterClass *X = *I; ++I) {
-    if (X == B)
-      return B;                 // B is a subclass of A
-
-    // X must be a common subclass of A and B
-    if (!B->hasSubClass(X))
-      continue;
-
-    // A superclass is definitely better.
-    if (!Best || Best->hasSuperClass(X)) {
-      Best = X;
-      continue;
-    }
-
-    // A subclass is definitely worse
-    if (Best->hasSubClass(X))
-      continue;
-
-    // Best and *I have no super/sub class relation - pick the larger class, or
-    // the smaller spill size.
-    int nb = std::distance(Best->begin(), Best->end());
-    int ni = std::distance(X->begin(), X->end());
-    if (ni>nb || (ni==nb && X->getSize() < Best->getSize()))
-      Best = X;
-  }
-  return Best;
+  // We could start the search from max(A.ID, B.ID), but we are only going to
+  // execute 2-3 iterations anyway.
+  for (unsigned Base = 0, BaseE = getNumRegClasses(); Base < BaseE; Base += 32)
+    if (unsigned Common = *SubA++ & *SubB++)
+      return getRegClass(Base + CountTrailingZeros_32(Common));
+
+  // No common sub-class exists.
+  return NULL;
 }
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index 40dbdd72faa19..94aca7abb2bdf 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -4,4 +4,13 @@ add_llvm_library(LLVMX86AsmParser
   X86AsmLexer.cpp
   X86AsmParser.cpp
   )
-add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86AsmParser
+  LLVMMC
+  LLVMMCParser
+  LLVMSupport
+  LLVMX86Desc
+  LLVMX86Info
+  )
+
+add_dependencies(LLVMX86AsmParser X86CommonTableGen)
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
index ec73087a3305f..1eaccff58a9d3 100644
--- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -7,20 +7,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "X86.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace llvm;
 
 namespace {
   
-class X86AsmLexer : public TargetAsmLexer {
+class X86AsmLexer : public MCTargetAsmLexer {
   const MCAsmInfo &AsmInfo;
   
   bool tentativeIsValid;
@@ -60,8 +60,8 @@ protected:
     }
   }
 public:
-  X86AsmLexer(const Target &T, const MCAsmInfo &MAI)
-    : TargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
+  X86AsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI)
+    : MCTargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
   }
 };
 
@@ -160,6 +160,6 @@ AsmToken X86AsmLexer::LexTokenIntel() {
 }
 
 extern "C" void LLVMInitializeX86AsmLexer() {
-  RegisterAsmLexer<X86AsmLexer> X(TheX86_32Target);
-  RegisterAsmLexer<X86AsmLexer> Y(TheX86_64Target);
+  RegisterMCAsmLexer<X86AsmLexer> X(TheX86_32Target);
+  RegisterMCAsmLexer<X86AsmLexer> Y(TheX86_64Target);
 }
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d45dd352fbc49..cb4f15ffed3e9 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -7,14 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Target/TargetAsmParser.h"
-#include "X86.h"
-#include "X86Subtarget.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
@@ -26,6 +24,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -33,7 +32,7 @@ using namespace llvm;
 namespace {
 struct X86Operand;
 
-class X86ATTAsmParser : public TargetAsmParser {
+class X86ATTAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
 
@@ -48,6 +47,7 @@ private:
   X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
 
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
 
   bool MatchAndEmitInstruction(SMLoc IDLoc,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -65,6 +65,10 @@ private:
     // FIXME: Can tablegen auto-generate this?
     return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
   }
+  void SwitchMode() {
+    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit));
+    setAvailableFeatures(FB);
+  }
 
   /// @name Auto-generated Matcher Functions
   /// {
@@ -76,7 +80,7 @@ private:
 
 public:
   X86ATTAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : TargetAsmParser(), STI(sti), Parser(parser) {
+    : MCTargetAsmParser(), STI(sti), Parser(parser) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -223,6 +227,21 @@ struct X86Operand : public MCParsedAsmOperand {
             (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
             (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
   }
+  bool isImmZExtu32u8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    uint64_t Value = CE->getValue();
+    return (Value <= 0x00000000000000FFULL);
+  }
   bool isImmSExti64i8() const {
     if (!isImm())
       return false;
@@ -382,19 +401,25 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
   if (Tok.isNot(AsmToken::Identifier))
     return Error(Tok.getLoc(), "invalid register name");
 
-  // FIXME: Validate register for the current architecture; we have to do
-  // validation later, so maybe there is no need for this here.
   RegNo = MatchRegisterName(Tok.getString());
 
   // If the match failed, try the register name as lowercase.
   if (RegNo == 0)
     RegNo = MatchRegisterName(LowercaseString(Tok.getString()));
 
-  // FIXME: This should be done using Requires<In32BitMode> and
-  // Requires<In64BitMode> so "eiz" usage in 64-bit instructions
-  // can be also checked.
-  if (RegNo == X86::RIZ && !is64BitMode())
-    return Error(Tok.getLoc(), "riz register in 64-bit mode only");
+  if (!is64BitMode()) {
+    // FIXME: This should be done using Requires<In32BitMode> and
+    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+    // checked.
+    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+    // REX prefix.
+    if (RegNo == X86::RIZ ||
+        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+        X86II::isX86_64NonExtLowByteReg(RegNo) ||
+        X86II::isX86_64ExtendedReg(RegNo))
+      return Error(Tok.getLoc(), "register %"
+                   + Tok.getString() + " is only available in 64-bit mode");
+  }
 
   // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
   if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
@@ -472,7 +497,7 @@ X86Operand *X86ATTAsmParser::ParseOperand() {
     SMLoc Start, End;
     if (ParseRegister(RegNo, Start, End)) return 0;
     if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
-      Error(Start, "eiz and riz can only be used as index registers");
+      Error(Start, "%eiz and %riz can only be used as index registers");
       return 0;
     }
 
@@ -956,6 +981,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
 
   // First, try a direct match.
   switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo)) {
+  default: break;
   case Match_Success:
     Out.EmitInstruction(Inst);
     return false;
@@ -994,7 +1020,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
   // Check for the various suffix matches.
   Tmp[Base.size()] = Suffixes[0];
   unsigned ErrorInfoIgnore;
-  MatchResultTy Match1, Match2, Match3, Match4;
+  unsigned Match1, Match2, Match3, Match4;
   
   Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
   Tmp[Base.size()] = Suffixes[1];
@@ -1096,6 +1122,8 @@ bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
     return ParseDirectiveWord(2, DirectiveID.getLoc());
+  else if (IDVal.startswith(".code"))
+    return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
   return true;
 }
 
@@ -1124,15 +1152,35 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
+/// ParseDirectiveCode
+///  ::= .code32 | .code64
+bool X86ATTAsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  if (IDVal == ".code32") {
+    Parser.Lex();
+    if (is64BitMode()) {
+      SwitchMode();
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+    }
+  } else if (IDVal == ".code64") {
+    Parser.Lex();
+    if (!is64BitMode()) {
+      SwitchMode();
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+    }
+  } else {
+    return Error(L, "unexpected directive " + IDVal);
+  }
 
+  return false;
+}
 
 
 extern "C" void LLVMInitializeX86AsmLexer();
 
 // Force static initialization.
 extern "C" void LLVMInitializeX86AsmParser() {
-  RegisterAsmParser<X86ATTAsmParser> X(TheX86_32Target);
-  RegisterAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
+  RegisterMCAsmParser<X86ATTAsmParser> X(TheX86_32Target);
+  RegisterMCAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
   LLVMInitializeX86AsmLexer();
 }
 
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index b112f9ff69bb9..351e7675a7e8e 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -1,20 +1,19 @@
 set(LLVM_TARGET_DEFINITIONS X86.td)
 
-tablegen(X86GenRegisterInfo.inc -gen-register-info)
-tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(X86GenInstrInfo.inc -gen-instr-info)
-tablegen(X86GenAsmWriter.inc -gen-asm-writer)
-tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(X86GenDAGISel.inc -gen-dag-isel)
-tablegen(X86GenFastISel.inc -gen-fast-isel)
-tablegen(X86GenCallingConv.inc -gen-callingconv)
-tablegen(X86GenSubtargetInfo.inc -gen-subtarget)
-tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(X86GenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
+llvm_tablegen(X86GenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+llvm_tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(X86GenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(X86GenFastISel.inc -gen-fast-isel)
+llvm_tablegen(X86GenCallingConv.inc -gen-callingconv)
+llvm_tablegen(X86GenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info)
+add_public_tablegen_target(X86CommonTableGen)
 
 set(sources
-  SSEDomainFix.cpp
-  X86AsmBackend.cpp
   X86AsmPrinter.cpp
   X86COFFMachineModuleInfo.cpp
   X86CodeEmitter.cpp
@@ -26,14 +25,13 @@ set(sources
   X86ISelLowering.cpp
   X86InstrInfo.cpp
   X86JITInfo.cpp
-  X86MachObjectWriter.cpp
-  X86MCCodeEmitter.cpp 
   X86MCInstLower.cpp
   X86RegisterInfo.cpp
   X86SelectionDAGInfo.cpp
   X86Subtarget.cpp
   X86TargetMachine.cpp
   X86TargetObjectFile.cpp
+  X86VZeroUpper.cpp
   )
 
 if( CMAKE_CL_64 )
@@ -53,6 +51,19 @@ endif()
 
 add_llvm_target(X86CodeGen ${sources})
 
+add_llvm_library_dependencies(LLVMX86CodeGen
+  LLVMAnalysis
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  LLVMX86AsmPrinter
+  LLVMX86Desc
+  )
+
 add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
 add_subdirectory(InstPrinter)
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index 972a0d9e7e031..4f570d56e60f0 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -4,6 +4,13 @@ add_llvm_library(LLVMX86Disassembler
   X86Disassembler.cpp
   X86DisassemblerDecoder.c
   )
+
+add_llvm_library_dependencies(LLVMX86Disassembler
+  LLVMMC
+  LLVMSupport
+  LLVMX86Info
+  )
+
 # workaround for hanging compilation on MSVC9 and 10
 if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
 set_property(
@@ -11,4 +18,5 @@ set_property(
   PROPERTY COMPILE_FLAGS "/Od"
   )
 endif()
-add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen)
+
+add_dependencies(LLVMX86Disassembler X86CommonTableGen)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4a0d2ec727a92..3aacb20e73df6 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -21,13 +21,16 @@
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define GET_REGINFO_ENUM
 #include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
 #include "X86GenEDInfo.inc"
 
 using namespace llvm;
@@ -64,8 +67,8 @@ extern Target TheX86_32Target, TheX86_64Target;
 static bool translateInstruction(MCInst &target,
                                 InternalInstruction &source);
 
-X86GenericDisassembler::X86GenericDisassembler(DisassemblerMode mode) :
-    MCDisassembler(),
+X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode) :
+    MCDisassembler(STI),
     fMode(mode) {
 }
 
@@ -106,28 +109,34 @@ static void logger(void* arg, const char* log) {
 // Public interface for the disassembler
 //
 
-bool X86GenericDisassembler::getInstruction(MCInst &instr,
-                                            uint64_t &size,
-                                            const MemoryObject &region,
-                                            uint64_t address,
-                                            raw_ostream &vStream) const {
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+                                       uint64_t &size,
+                                       const MemoryObject &region,
+                                       uint64_t address,
+                                       raw_ostream &vStream,
+                                       raw_ostream &cStream) const {
   InternalInstruction internalInstr;
+
+  dlog_t loggerFn = logger;
+  if (&vStream == &nulls())
+    loggerFn = 0; // Disable logging completely if it's going to nulls().
   
   int ret = decodeInstruction(&internalInstr,
                               regionReader,
                               (void*)&region,
-                              logger,
+                              loggerFn,
                               (void*)&vStream,
                               address,
                               fMode);
 
   if (ret) {
     size = internalInstr.readerCursor - address;
-    return false;
+    return Fail;
   }
   else {
     size = internalInstr.length;
-    return !translateInstruction(instr, internalInstr);
+    return (!translateInstruction(instr, internalInstr)) ? Success : Fail;
   }
 }
 
@@ -183,8 +192,46 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
       break;
     }
   }
+  // By default sign-extend all X86 immediates based on their encoding.
+  else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+           type == TYPE_IMM64) {
+    uint32_t Opcode = mcInst.getOpcode();
+    switch (operand.encoding) {
+    default:
+      break;
+    case ENCODING_IB:
+      // Special case those X86 instructions that use the imm8 as a set of
+      // bits, bit count, etc. and are not sign-extend.
+      if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+	  Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+	  Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+	  Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+	  Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+	  Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+	  Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+	  Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+	  Opcode != X86::VINSERTPSrr)
+	type = TYPE_MOFFS8;
+      break;
+    case ENCODING_IW:
+      type = TYPE_MOFFS16;
+      break;
+    case ENCODING_ID:
+      type = TYPE_MOFFS32;
+      break;
+    case ENCODING_IO:
+      type = TYPE_MOFFS64;
+      break;
+    }
+  }
 
   switch (type) {
+  case TYPE_XMM128:
+    mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM256:
+    mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+    return;
   case TYPE_MOFFS8:
   case TYPE_REL8:
     if(immediate & 0x80)
@@ -543,12 +590,12 @@ static bool translateInstruction(MCInst &mcInst,
   return false;
 }
 
-static MCDisassembler *createX86_32Disassembler(const Target &T) {
-  return new X86Disassembler::X86_32Disassembler;
+static MCDisassembler *createX86_32Disassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new X86Disassembler::X86_32Disassembler(STI);
 }
 
-static MCDisassembler *createX86_64Disassembler(const Target &T) {
-  return new X86Disassembler::X86_64Disassembler;
+static MCDisassembler *createX86_64Disassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new X86Disassembler::X86_64Disassembler(STI);
 }
 
 extern "C" void LLVMInitializeX86Disassembler() { 
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 550cf9d40de2a..6ac9a0ff1019a 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -92,6 +92,7 @@ struct InternalInstruction;
 namespace llvm {
   
 class MCInst;
+class MCSubtargetInfo;
 class MemoryObject;
 class raw_ostream;
 
@@ -107,16 +108,17 @@ protected:
   /// Constructor     - Initializes the disassembler.
   ///
   /// @param mode     - The X86 architecture mode to decode for.
-  X86GenericDisassembler(DisassemblerMode mode);
+  X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode);
 public:
   ~X86GenericDisassembler();
 
   /// getInstruction - See MCDisassembler.
-  bool getInstruction(MCInst &instr,
-                      uint64_t &size,
-                      const MemoryObject &region,
-                      uint64_t address,
-                      raw_ostream &vStream) const;
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
 
   /// getEDInfo - See MCDisassembler.
   EDInstInfo *getEDInfo() const;
@@ -127,24 +129,24 @@ private:
 /// X86_16Disassembler - 16-bit X86 disassembler.
 class X86_16Disassembler : public X86GenericDisassembler {
 public:
-  X86_16Disassembler() :
-    X86GenericDisassembler(MODE_16BIT) {
+  X86_16Disassembler(const MCSubtargetInfo &STI) :
+    X86GenericDisassembler(STI, MODE_16BIT) {
   }
 };  
 
 /// X86_16Disassembler - 32-bit X86 disassembler.
 class X86_32Disassembler : public X86GenericDisassembler {
 public:
-  X86_32Disassembler() :
-    X86GenericDisassembler(MODE_32BIT) {
+  X86_32Disassembler(const MCSubtargetInfo &STI) :
+    X86GenericDisassembler(STI, MODE_32BIT) {
   }
 };
 
 /// X86_16Disassembler - 64-bit X86 disassembler.
 class X86_64Disassembler : public X86GenericDisassembler {
 public:
-  X86_64Disassembler() :
-    X86GenericDisassembler(MODE_64BIT) {
+  X86_64Disassembler(const MCSubtargetInfo &STI) :
+    X86GenericDisassembler(STI, MODE_64BIT) {
   }
 };
 
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index de1610ba3d66a..f9b0fe5d51b9b 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -58,8 +58,8 @@ static InstructionContext contextForAttrs(uint8_t attrMask) {
  * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
  */
 static int modRMRequired(OpcodeType type,
-                                InstructionContext insnContext,
-                                uint8_t opcode) {
+                         InstructionContext insnContext,
+                         uint8_t opcode) {
   const struct ContextDecision* decision = 0;
   
   switch (type) {
@@ -391,7 +391,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       return -1;
     }
     
-    if (insn->mode == MODE_64BIT || byte1 & 0x8) {
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
       insn->vexSize = 3;
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
@@ -406,12 +406,14 @@ static int readPrefixes(struct InternalInstruction* insn) {
       consumeByte(insn, &insn->vexPrefix[2]);
 
       /* We simulate the REX prefix for simplicity's sake */
-    
-      insn->rexPrefix = 0x40 
-                      | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
-                      | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
-                      | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
-                      | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+   
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40 
+                        | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+      }
     
       switch (ppFromVEX3of3(insn->vexPrefix[2]))
       {
@@ -433,7 +435,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       return -1;
     }
       
-    if (insn->mode == MODE_64BIT || byte1 & 0x8) {
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
       insn->vexSize = 2;
     }
     else {
@@ -444,8 +446,10 @@ static int readPrefixes(struct InternalInstruction* insn) {
       insn->vexPrefix[0] = byte;
       consumeByte(insn, &insn->vexPrefix[1]);
         
-      insn->rexPrefix = 0x40 
-                      | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40 
+                        | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+      }
         
       switch (ppFromVEX2of2(insn->vexPrefix[1]))
       {
@@ -699,34 +703,6 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
   }
 }
 
-/*
- * is64BitEquivalent - Determines whether two instruction names refer to
- * equivalent instructions but one is 64-bit whereas the other is not.
- *
- * @param orig  - The instruction that is not 64-bit
- * @param equiv - The instruction that is 64-bit
- */
-static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
-  off_t i;
-  
-  for (i = 0;; i++) {
-    if (orig[i] == '\0' && equiv[i] == '\0')
-      return TRUE;
-    if (orig[i] == '\0' || equiv[i] == '\0')
-      return FALSE;
-    if (orig[i] != equiv[i]) {
-      if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q')
-        continue;
-      if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6')
-        continue;
-      if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4')
-        continue;
-      return FALSE;
-    }
-  }
-}
-
-
 /*
  * getID - Determines the ID of an instruction, consuming the ModR/M byte as 
  *   appropriate for extended and escape opcodes.  Determines the attributes and 
@@ -763,8 +739,6 @@ static int getID(struct InternalInstruction* insn) {
         break;
       }
     
-      if (wFromVEX3of3(insn->vexPrefix[2]))
-        attrMask |= ATTR_REXW;
       if (lFromVEX3of3(insn->vexPrefix[2]))
         attrMask |= ATTR_VEXL;
     }
@@ -789,63 +763,55 @@ static int getID(struct InternalInstruction* insn) {
     }
   }
   else {
-    if (insn->rexPrefix & 0x08)
-      attrMask |= ATTR_REXW;
-  
     if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
       attrMask |= ATTR_OPSIZE;
     else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
       attrMask |= ATTR_XS;
     else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
       attrMask |= ATTR_XD;
-    
   }
 
+  if (insn->rexPrefix & 0x08)
+    attrMask |= ATTR_REXW;
+  
   if (getIDWithAttrMask(&instructionID, insn, attrMask))
     return -1;
   
   /* The following clauses compensate for limitations of the tables. */
   
-  if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
+  if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW)) {
     /*
-     * Although for SSE instructions it is usually necessary to treat REX.W+F2
-     * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
-     * an occasional instruction where F2 is incidental and REX.W is the more
-     * significant.  If the decoded instruction is 32-bit and adding REX.W
-     * instead of F2 changes a 32 to a 64, we adopt the new encoding.
+     * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
+     * has precedence since there are no L-bit with W-bit entries in the tables.
+     * So if the L-bit isn't significant we should use the W-bit instead.
      */
-    
+
     const struct InstructionSpecifier *spec;
-    uint16_t instructionIDWithREXw;
-    const struct InstructionSpecifier *specWithREXw;
-    
+    uint16_t instructionIDWithWBit;
+    const struct InstructionSpecifier *specWithWBit;
+
     spec = specifierForUID(instructionID);
-    
-    if (getIDWithAttrMask(&instructionIDWithREXw,
+
+    if (getIDWithAttrMask(&instructionIDWithWBit,
                           insn,
-                          attrMask & (~ATTR_XD))) {
-      /*
-       * Decoding with REX.w would yield nothing; give up and return original
-       * decode.
-       */
-      
+                          (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
       insn->instructionID = instructionID;
       insn->spec = spec;
       return 0;
     }
-    
-    specWithREXw = specifierForUID(instructionIDWithREXw);
-    
-    if (is64BitEquivalent(spec->name, specWithREXw->name)) {
-      insn->instructionID = instructionIDWithREXw;
-      insn->spec = specWithREXw;
+
+    specWithWBit = specifierForUID(instructionIDWithWBit);
+
+    if (instructionID != instructionIDWithWBit) {
+      insn->instructionID = instructionIDWithWBit;
+      insn->spec = specWithWBit;
     } else {
       insn->instructionID = instructionID;
       insn->spec = spec;
     }
     return 0;
   }
-  
+
   if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
     /*
      * The instruction tables make no distinction between instructions that
@@ -885,6 +851,43 @@ static int getID(struct InternalInstruction* insn) {
     }
     return 0;
   }
+
+  if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+      insn->rexPrefix & 0x01) {
+    /*
+     * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+     * it should decode as XCHG %r8, %eax.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithNewOpcode;
+    const struct InstructionSpecifier *specWithNewOpcode;
+
+    spec = specifierForUID(instructionID);
+    
+    /* Borrow opcode from one of the other XCHGar opcodes */
+    insn->opcode = 0x91;
+   
+    if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+                          insn,
+                          attrMask)) {
+      insn->opcode = 0x90;
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+    /* Change back */
+    insn->opcode = 0x90;
+
+    insn->instructionID = instructionIDWithNewOpcode;
+    insn->spec = specWithNewOpcode;
+
+    return 0;
+  }
   
   insn->instructionID = instructionID;
   insn->spec = specifierForUID(insn->instructionID);
@@ -1434,11 +1437,10 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
 }
 
 /*
- * readVVVV - Consumes an immediate operand from an instruction, given the
- *   desired operand size.
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
  *
  * @param insn  - The instruction whose operand is to be read.
- * @return      - 0 if the immediate was successfully consumed; nonzero
+ * @return      - 0 if the vvvv was successfully consumed; nonzero
  *                otherwise.
  */
 static int readVVVV(struct InternalInstruction* insn) {
@@ -1451,6 +1453,9 @@ static int readVVVV(struct InternalInstruction* insn) {
   else
     return -1;
 
+  if (insn->mode != MODE_64BIT)
+    insn->vvvv &= 0x7;
+
   return 0;
 }
 
@@ -1463,8 +1468,14 @@ static int readVVVV(struct InternalInstruction* insn) {
  */
 static int readOperands(struct InternalInstruction* insn) {
   int index;
+  int hasVVVV, needVVVV;
   
   dbgprintf(insn, "readOperands()");
+
+  /* If non-zero vvvv specified, need to make sure one of the operands
+     uses it. */
+  hasVVVV = !readVVVV(insn);
+  needVVVV = hasVVVV && (insn->vvvv != 0);
   
   for (index = 0; index < X86_MAX_OPERANDS; ++index) {
     switch (insn->spec->operands[index].encoding) {
@@ -1537,7 +1548,8 @@ static int readOperands(struct InternalInstruction* insn) {
         return -1;
       break;
     case ENCODING_VVVV:
-      if (readVVVV(insn))
+      needVVVV = 0; /* Mark that we have found a VVVV operand. */
+      if (!hasVVVV)
         return -1;
       if (fixupReg(insn, &insn->spec->operands[index]))
         return -1;
@@ -1549,6 +1561,9 @@ static int readOperands(struct InternalInstruction* insn) {
       return -1;
     }
   }
+
+  /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+  if (needVVVV) return -1;
   
   return 0;
 }
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 70315ed572b40..8b7933545a567 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -81,12 +81,18 @@ enum attributeBits {
                                         "but not the operands")                \
   ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
                                         "but not the operands")                \
+  ENUM_ENTRY(IC_XD_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XS_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
   ENUM_ENTRY(IC_64BIT_REXW,         4,  "requires a REX.W prefix, so operands "\
                                         "change width; overrides IC_OPSIZE")   \
   ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
   ENUM_ENTRY(IC_64BIT_XD,           5,  "XD instructions are SSE; REX.W is "   \
                                         "secondary")                           \
   ENUM_ENTRY(IC_64BIT_XS,           5,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_XD_OPSIZE,    3,  "Just as meaningful as IC_XD_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_XS_OPSIZE,    3,  "Just as meaningful as IC_XS_OPSIZE")  \
   ENUM_ENTRY(IC_64BIT_REXW_XS,      6,  "OPSIZE could mean a different "       \
                                         "opcode")                              \
   ENUM_ENTRY(IC_64BIT_REXW_XD,      6,  "Just as meaningful as "               \
@@ -104,7 +110,7 @@ enum attributeBits {
   ENUM_ENTRY(IC_VEX_W_OPSIZE,       4,  "requires VEX, W, and OpSize")         \
   ENUM_ENTRY(IC_VEX_L,              3,  "requires VEX and the L prefix")       \
   ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
-  ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
   ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")
 
 
diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt
index 033973eeeff93..2a2b5dbb43db6 100644
--- a/lib/Target/X86/InstPrinter/CMakeLists.txt
+++ b/lib/Target/X86/InstPrinter/CMakeLists.txt
@@ -5,4 +5,11 @@ add_llvm_library(LLVMX86AsmPrinter
   X86IntelInstPrinter.cpp
   X86InstComments.cpp
   )
-add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86AsmPrinter
+  LLVMMC
+  LLVMSupport
+  LLVMX86Utils
+  )
+
+add_dependencies(LLVMX86AsmPrinter X86CommonTableGen)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index c37d8797b39c1..029d491260f6b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -39,14 +39,17 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS,
   OS << '%' << getRegisterName(RegNo);
 }
 
-void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot) {
   // Try to print any aliases first.
   if (!printAliasInstr(MI, OS))
     printInstruction(MI, OS);
   
   // If verbose assembly is enabled, we can print some informative comments.
-  if (CommentStream)
+  if (CommentStream) {
+    printAnnotation(OS, Annot);
     EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+  }
 }
 
 StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const {
@@ -90,7 +93,8 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   if (Op.isReg()) {
     O << '%' << getRegisterName(Op.getReg());
   } else if (Op.isImm()) {
-    O << '$' << Op.getImm();
+    // Print X86 immediates as signed values.
+    O << '$' << (int64_t)Op.getImm();
     
     if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
       *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm());
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 5426e5cf38d9a..0293869b0a9b5 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -25,7 +25,7 @@ public:
   X86ATTInstPrinter(const MCAsmInfo &MAI);
   
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &OS);
+  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
 
   // Autogenerated by tblgen, returns true if we successfully printed an
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe7fa81f..8d85b95fe81d9 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -14,9 +14,9 @@
 
 #include "X86InstComments.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/raw_ostream.h"
-#include "../Utils/X86ShuffleDecode.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -136,9 +136,11 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     break;
 
   case X86::SHUFPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPDrmi:
     DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
-    Src2Name = getRegName(MI->getOperand(2).getReg());
     break;
 
   case X86::SHUFPSrri:
@@ -205,6 +207,31 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DecodeUNPCKHPMask(4, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPERMILPSri:
+    DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSYri:
+    DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDri:
+    DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERM2F128rr:
+    DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    break;
   }
 
 
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 506e26cbf7cdf..f9ab5aeee122c 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -32,12 +32,15 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   OS << getRegisterName(RegNo);
 }
 
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot) {
   printInstruction(MI, OS);
   
   // If verbose assembly is enabled, we can print some informative comments.
-  if (CommentStream)
+  if (CommentStream) {
+    printAnnotation(OS, Annot);
     EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+  }
 }
 StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index e84a1940017d2..6d5ec6226a9eb 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -27,7 +27,7 @@ public:
     : MCInstPrinter(MAI) {}
 
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &OS);
+  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
   // Autogenerated by tblgen.
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index ca88f8ffd08ca..87219120e2a82 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -1,7 +1,20 @@
 add_llvm_library(LLVMX86Desc
+  X86AsmBackend.cpp
   X86MCTargetDesc.cpp
   X86MCAsmInfo.cpp
+  X86MCCodeEmitter.cpp 
+  X86MachObjectWriter.cpp
   )
 
+add_llvm_library_dependencies(LLVMX86Desc
+  LLVMMC
+  LLVMSupport
+  LLVMX86AsmPrinter
+  LLVMX86AsmPrinter
+  LLVMX86Info
+  )
+
+add_dependencies(LLVMX86Desc X86CommonTableGen)
+
 # Hack: we need to include 'main' target directory to grab private headers
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 0000000000000..69ad7d7b6b327
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,458 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Option to allow disabling arithmetic relaxation to workaround PR9807, which
+// is useful when running bitwise comparison experiments on Darwin. We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+         cl::desc("Disable relaxation of arithmetic instruction for X86"));
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default: assert(0 && "invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_global_offset_table:
+  case FK_Data_4: return 2;
+  case FK_PCRel_8:
+  case FK_Data_8: return 3;
+  }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine,
+                     bool HasRelocationAddend)
+    : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+public:
+  X86AsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  unsigned getNumFixupKinds() const {
+    return X86::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+      { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+      { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+      { "reloc_signed_4byte", 0, 4 * 8, 0},
+      { "reloc_global_offset_table", 0, 4 * 8, 0}
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that uppper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
+    assert(isIntN(Size * 8 + 1, Value) &&
+           "Value does not fit in the Fixup field");
+
+    for (unsigned i = 0; i != Size; ++i)
+      Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+  }
+
+  bool MayNeedRelaxation(const MCInst &Inst) const;
+
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+  case X86::JAE_1: return X86::JAE_4;
+  case X86::JA_1:  return X86::JA_4;
+  case X86::JBE_1: return X86::JBE_4;
+  case X86::JB_1:  return X86::JB_4;
+  case X86::JE_1:  return X86::JE_4;
+  case X86::JGE_1: return X86::JGE_4;
+  case X86::JG_1:  return X86::JG_4;
+  case X86::JLE_1: return X86::JLE_4;
+  case X86::JL_1:  return X86::JL_4;
+  case X86::JMP_1: return X86::JMP_4;
+  case X86::JNE_1: return X86::JNE_4;
+  case X86::JNO_1: return X86::JNO_4;
+  case X86::JNP_1: return X86::JNP_4;
+  case X86::JNS_1: return X86::JNS_4;
+  case X86::JO_1:  return X86::JO_4;
+  case X86::JP_1:  return X86::JP_4;
+  case X86::JS_1:  return X86::JS_4;
+  }
+}
+
+static unsigned getRelaxedOpcodeArith(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+    // IMUL
+  case X86::IMUL16rri8: return X86::IMUL16rri;
+  case X86::IMUL16rmi8: return X86::IMUL16rmi;
+  case X86::IMUL32rri8: return X86::IMUL32rri;
+  case X86::IMUL32rmi8: return X86::IMUL32rmi;
+  case X86::IMUL64rri8: return X86::IMUL64rri32;
+  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+    // AND
+  case X86::AND16ri8: return X86::AND16ri;
+  case X86::AND16mi8: return X86::AND16mi;
+  case X86::AND32ri8: return X86::AND32ri;
+  case X86::AND32mi8: return X86::AND32mi;
+  case X86::AND64ri8: return X86::AND64ri32;
+  case X86::AND64mi8: return X86::AND64mi32;
+
+    // OR
+  case X86::OR16ri8: return X86::OR16ri;
+  case X86::OR16mi8: return X86::OR16mi;
+  case X86::OR32ri8: return X86::OR32ri;
+  case X86::OR32mi8: return X86::OR32mi;
+  case X86::OR64ri8: return X86::OR64ri32;
+  case X86::OR64mi8: return X86::OR64mi32;
+
+    // XOR
+  case X86::XOR16ri8: return X86::XOR16ri;
+  case X86::XOR16mi8: return X86::XOR16mi;
+  case X86::XOR32ri8: return X86::XOR32ri;
+  case X86::XOR32mi8: return X86::XOR32mi;
+  case X86::XOR64ri8: return X86::XOR64ri32;
+  case X86::XOR64mi8: return X86::XOR64mi32;
+
+    // ADD
+  case X86::ADD16ri8: return X86::ADD16ri;
+  case X86::ADD16mi8: return X86::ADD16mi;
+  case X86::ADD32ri8: return X86::ADD32ri;
+  case X86::ADD32mi8: return X86::ADD32mi;
+  case X86::ADD64ri8: return X86::ADD64ri32;
+  case X86::ADD64mi8: return X86::ADD64mi32;
+
+    // SUB
+  case X86::SUB16ri8: return X86::SUB16ri;
+  case X86::SUB16mi8: return X86::SUB16mi;
+  case X86::SUB32ri8: return X86::SUB32ri;
+  case X86::SUB32mi8: return X86::SUB32mi;
+  case X86::SUB64ri8: return X86::SUB64ri32;
+  case X86::SUB64mi8: return X86::SUB64mi32;
+
+    // CMP
+  case X86::CMP16ri8: return X86::CMP16ri;
+  case X86::CMP16mi8: return X86::CMP16mi;
+  case X86::CMP32ri8: return X86::CMP32ri;
+  case X86::CMP32mi8: return X86::CMP32mi;
+  case X86::CMP64ri8: return X86::CMP64ri32;
+  case X86::CMP64mi8: return X86::CMP64mi32;
+
+    // PUSH
+  case X86::PUSHi8: return X86::PUSHi32;
+  case X86::PUSHi16: return X86::PUSHi32;
+  case X86::PUSH64i8: return X86::PUSH64i32;
+  case X86::PUSH64i16: return X86::PUSH64i32;
+  }
+}
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+  unsigned R = getRelaxedOpcodeArith(Op);
+  if (R != Op)
+    return R;
+  return getRelaxedOpcodeBranch(Op);
+}
+
+bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+  // Branches can always be relaxed.
+  if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+
+  if (MCDisableArithRelaxation)
+    return false;
+
+  // Check if this instruction is ever relaxable.
+  if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+
+  // Check if it has an expression and is not RIP relative.
+  bool hasExp = false;
+  bool hasRIP = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+    const MCOperand &Op = Inst.getOperand(i);
+    if (Op.isExpr())
+      hasExp = true;
+
+    if (Op.isReg() && Op.getReg() == X86::RIP)
+      hasRIP = true;
+  }
+
+  // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
+  // how we do relaxations?
+  return hasExp && !hasRIP;
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+/// WriteNopData - Write optimal nops to the output file for the \arg Count
+/// bytes.  This returns the number of bytes written.  It may return 0 if
+/// the \arg Count is more than the maximum optimal nops.
+bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // Write an optimal sequence for the first 15 bytes.
+  const uint64_t OptimalCount = (Count < 16) ? Count : 15;
+  const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
+  for (uint64_t i = 0, e = Prefixes; i != e; i++)
+    OW->Write8(0x66);
+  const uint64_t Rest = OptimalCount - Prefixes;
+  for (uint64_t i = 0, e = Rest; i != e; i++)
+    OW->Write8(Nops[Rest - 1][i]);
+
+  // Finish with single byte nops.
+  for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
+   OW->Write8(0x90);
+
+  return true;
+}
+
+/* *** */
+
+namespace {
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+  Triple::OSType OSType;
+  ELFX86AsmBackend(const Target &T, Triple::OSType _OSType)
+    : X86AsmBackend(T), OSType(_OSType) {
+    HasReliableSymbolDifference = true;
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section);
+    return ES.getFlags() & ELF::SHF_MERGE;
+  }
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType)
+    : ELFX86AsmBackend(T, OSType) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(createELFObjectTargetWriter(),
+                                 OS, /*IsLittleEndian*/ true);
+  }
+
+  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+    return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false);
+  }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType)
+    : ELFX86AsmBackend(T, OSType) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(createELFObjectTargetWriter(),
+                                 OS, /*IsLittleEndian*/ true);
+  }
+
+  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+    return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true);
+  }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+  bool Is64Bit;
+
+public:
+  WindowsX86AsmBackend(const Target &T, bool is64Bit)
+    : X86AsmBackend(T)
+    , Is64Bit(is64Bit) {
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createWinCOFFObjectWriter(OS, Is64Bit);
+  }
+};
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+public:
+  DarwinX86AsmBackend(const Target &T)
+    : X86AsmBackend(T) { }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_32AsmBackend(const Target &T)
+    : DarwinX86AsmBackend(T) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+                                     object::mach::CTM_i386,
+                                     object::mach::CSX86_ALL);
+  }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_64AsmBackend(const Target &T)
+    : DarwinX86AsmBackend(T) {
+    HasReliableSymbolDifference = true;
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+                                     object::mach::CTM_x86_64,
+                                     object::mach::CSX86_ALL);
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    // Temporary labels in the string literals sections require symbols. The
+    // issue is that the x86_64 relocation format does not allow symbol +
+    // offset, and so the linker does not have enough information to resolve the
+    // access to the appropriate atom unless an external relocation is used. For
+    // non-cstring sections, we expect the compiler to use a non-temporary label
+    // for anything that could have an addend pointing outside the symbol.
+    //
+    // See <rdar://problem/4765733>.
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
+  }
+
+  virtual bool isSectionAtomizable(const MCSection &Section) const {
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    // Fixed sized data sections are uniqued, they cannot be diced into atoms.
+    switch (SMO.getType()) {
+    default:
+      return true;
+
+    case MCSectionMachO::S_4BYTE_LITERALS:
+    case MCSectionMachO::S_8BYTE_LITERALS:
+    case MCSectionMachO::S_16BYTE_LITERALS:
+    case MCSectionMachO::S_LITERAL_POINTERS:
+    case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
+    case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
+    case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
+    case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
+    case MCSectionMachO::S_INTERPOSING:
+      return false;
+    }
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return new DarwinX86_32AsmBackend(T);
+
+  if (TheTriple.isOSWindows())
+    return new WindowsX86AsmBackend(T, false);
+
+  return new ELFX86_32AsmBackend(T, TheTriple.getOS());
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return new DarwinX86_64AsmBackend(T);
+
+  if (TheTriple.isOSWindows())
+    return new WindowsX86AsmBackend(T, true);
+
+  return new ELFX86_64AsmBackend(T, TheTriple.getOS());
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 0000000000000..e6ba705d4d87c
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,548 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86BASEINFO_H
+#define X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include <cassert>
+
+namespace llvm {
+
+namespace X86 {
+  // Enums for memory operand decoding.  Each memory operand is represented with
+  // a 5 operand sequence in the form:
+  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+  // These enums help decode this.
+  enum {
+    AddrBaseReg = 0,
+    AddrScaleAmt = 1,
+    AddrIndexReg = 2,
+    AddrDisp = 3,
+
+    /// AddrSegmentReg - The operand # of the segment in the memory operand.
+    AddrSegmentReg = 4,
+
+    /// AddrNumOperands - Total number of operands in a memory reference.
+    AddrNumOperands = 5
+  };
+} // end namespace X86;
+ 
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // X86 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+    /// relocation of:
+    ///    SYMBOL_LABEL + [. - PICBASELABEL]
+    MO_GOT_ABSOLUTE_ADDRESS,
+
+    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+    /// immediate should get the value of the symbol minus the PIC base label:
+    ///    SYMBOL_LABEL - PICBASELABEL
+    MO_PIC_BASE_OFFSET,
+
+    /// MO_GOT - On a symbol operand this indicates that the immediate is the
+    /// offset to the GOT entry for the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOT
+    MO_GOT,
+
+    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+    /// the offset to the location of the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTOFF
+    MO_GOTOFF,
+
+    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+    /// offset to the GOT entry for the symbol name from the current code
+    /// location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTPCREL
+    MO_GOTPCREL,
+
+    /// MO_PLT - On a symbol operand this indicates that the immediate is
+    /// offset to the PLT entry of symbol name from the current code location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @PLT
+    MO_PLT,
+
+    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSGD
+    MO_TLSGD,
+
+    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTTPOFF
+    MO_GOTTPOFF,
+
+    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @INDNTPOFF
+    MO_INDNTPOFF,
+
+    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TPOFF
+    MO_TPOFF,
+
+    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @NTPOFF
+    MO_NTPOFF,
+
+    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "__imp_FOO" symbol.  This is used for
+    /// dllimport linkage on windows.
+    MO_DLLIMPORT,
+
+    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
+    /// and jumps to external functions on Tiger and earlier.
+    MO_DARWIN_STUB,
+
+    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY,
+
+    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY_PIC_BASE,
+
+    /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+    /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+    /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+    /// stub.
+    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+    /// MO_TLVP - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// This is the TLS offset for the Darwin TLS mechanism.
+    MO_TLVP,
+
+    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+    /// is some TLS offset from the picbase.
+    ///
+    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+    MO_TLVP_PIC_BASE
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 6,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRMInitReg - This form is used for instructions whose source and
+    // destinations are the same register.
+    MRMInitReg = 32,
+
+    //// MRM_C1 - A mod/rm byte of exactly 0xC1.
+    MRM_C1 = 33,
+    MRM_C2 = 34,
+    MRM_C3 = 35,
+    MRM_C4 = 36,
+    MRM_C8 = 37,
+    MRM_C9 = 38,
+    MRM_E8 = 39,
+    MRM_F0 = 40,
+    MRM_F8 = 41,
+    MRM_F9 = 42,
+    MRM_D0 = 45,
+    MRM_D1 = 46,
+
+    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+    /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is a 8-bit fixed value.
+    RawFrmImm8 = 43,
+
+    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+    /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+    /// the imm encoding) and the second is a 16-bit fixed value.  In the AMD
+    /// manual, this operand is described as pntr16:32 and pntr16:16
+    RawFrmImm16 = 44,
+
+    FormMask       = 63,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - Set if this instruction requires an operand size prefix (0x66),
+    // which most often indicates that the instruction operates on 16 bit data
+    // instead of 32 bit data.
+    OpSize      = 1 << 6,
+
+    // AsSize - Set if this instruction requires an operand size prefix (0x67),
+    // which most often indicates that the instruction address 16 bit address
+    // instead of 32 bit address (or 32 bit address in 64 bit mode).
+    AdSize      = 1 << 7,
+
+    //===------------------------------------------------------------------===//
+    // Op0Mask - There are several prefix bytes that are used to form two byte
+    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
+    // used to obtain the setting of this field.  If no bits in this field is
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+    //
+    Op0Shift    = 8,
+    Op0Mask     = 0x1F << Op0Shift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB          = 1 << Op0Shift,
+
+    // REP - The 0xF3 prefix byte indicating repetition of the following
+    // instruction.
+    REP         = 2 << Op0Shift,
+
+    // D8-DF - These escape opcodes are used by the floating point unit.  These
+    // values must remain sequential.
+    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
+    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
+    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
+    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
+
+    // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
+    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
+    A6 = 15 << Op0Shift,  A7 = 16 << Op0Shift,
+
+    // TF - Prefix before and after 0x0F
+    TF = 17 << Op0Shift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only cares about REX.W and REX.R bits and only the former is
+    // statically determined.
+    //
+    REXShift    = Op0Shift + 5,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = REXShift + 1,
+    ImmMask    = 7 << ImmShift,
+    Imm8       = 1 << ImmShift,
+    Imm8PCRel  = 2 << ImmShift,
+    Imm16      = 3 << ImmShift,
+    Imm16PCRel = 4 << ImmShift,
+    Imm32      = 5 << ImmShift,
+    Imm32PCRel = 6 << ImmShift,
+    Imm64      = 7 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = ImmShift + 3,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+    // result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Lock prefix
+    LOCKShift = FPTypeShift + 3,
+    LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need ability to address
+    // stuff in gs and fs segments.
+    SegOvrShift = LOCKShift + 1,
+    SegOvrMask  = 3 << SegOvrShift,
+    FS          = 1 << SegOvrShift,
+    GS          = 2 << SegOvrShift,
+
+    // Execution domain for SSE instructions in bits 23, 24.
+    // 0 in bits 23-24 means normal, non-SSE instruction.
+    SSEDomainShift = SegOvrShift + 2,
+
+    OpcodeShift   = SSEDomainShift + 2,
+
+    //===------------------------------------------------------------------===//
+    /// VEX - The opcode prefix used by AVX instructions
+    VEXShift = OpcodeShift + 8,
+    VEX         = 1U << 0,
+
+    /// VEX_W - Has a opcode specific functionality, but is used in the same
+    /// way as REX_W is for regular SSE instructions.
+    VEX_W       = 1U << 1,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    /// address instructions in SSE are represented as 3 address ones in AVX
+    /// and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4V      = 1U << 2,
+
+    /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
+    /// must be encoded in the i8 immediate field. This usually happens in
+    /// instructions with 4 operands.
+    VEX_I8IMM   = 1U << 3,
+
+    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using a f256 memory references.
+    VEX_L       = 1U << 4,
+
+    // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+    // prefix. Usually used for scalar instructions. Needed by disassembler.
+    VEX_LIG     = 1U << 5,
+
+    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
+    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+    /// storing a classifier in the imm8 field.  To simplify our implementation,
+    /// we handle this by storeing the classifier in the opcode field and using
+    /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+    Has3DNow0F0FOpcode = 1U << 6
+  };
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified machine instruction.
+  //
+  static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+    return TSFlags >> X86II::OpcodeShift;
+  }
+
+  static inline bool hasImm(uint64_t TSFlags) {
+    return (TSFlags & X86II::ImmMask) != 0;
+  }
+
+  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+  /// of the specified instruction.
+  static inline unsigned getSizeOfImm(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: assert(0 && "Unknown immediate size");
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:  return 1;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
+    case X86II::Imm32:
+    case X86II::Imm32PCRel: return 4;
+    case X86II::Imm64:      return 8;
+    }
+  }
+
+  /// isImmPCRel - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is pc relative.
+  static inline unsigned isImmPCRel(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: assert(0 && "Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  static inline int getMemoryOperandNo(uint64_t TSFlags) {
+    switch (TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:  assert(0 && "FIXME: Remove this form");
+    default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+    case X86II::RawFrmImm8:
+    case X86II::RawFrmImm16:
+       return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem: {
+      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      unsigned FirstMemOp = 1;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+
+      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
+      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      return FirstMemOp;
+    }
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      return -1;
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+      return 0;
+    case X86II::MRM_C1:
+    case X86II::MRM_C2:
+    case X86II::MRM_C3:
+    case X86II::MRM_C4:
+    case X86II::MRM_C8:
+    case X86II::MRM_C9:
+    case X86II::MRM_E8:
+    case X86II::MRM_F0:
+    case X86II::MRM_F8:
+    case X86II::MRM_F9:
+    case X86II::MRM_D0:
+    case X86II::MRM_D1:
+      return -1;
+    }
+  }
+
+  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
+  static inline bool isX86_64ExtendedReg(unsigned RegNo) {
+    switch (RegNo) {
+    default: break;
+    case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
+    case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
+    case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
+    case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
+    case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
+    case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
+    case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
+    case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
+    case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+    case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
+    case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+    case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
+    case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
+        return true;
+    }
+    return false;
+  }
+  
+  static inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+    return (reg == X86::SPL || reg == X86::BPL ||
+            reg == X86::SIL || reg == X86::DIL);
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 0000000000000..17d242ab761e3
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,33 @@
+//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_X86_X86FIXUPKINDS_H
+#define LLVM_X86_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+  reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+  reloc_riprel_4byte_movq_load,              // 32-bit rip-relative in movq
+  reloc_signed_4byte,                        // 32-bit signed. Unlike FK_Data_4
+                                             // this will be sign extended at
+                                             // runtime.
+  reloc_global_offset_table,                 // 32-bit, relative to the start
+                                             // of the instruction. Used only
+                                             // for _GLOBAL_OFFSET_TABLE_.
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000000000..2eee1128119e4
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1074 @@
+//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  const MCInstrInfo &MCII;
+  const MCSubtargetInfo &STI;
+  MCContext &Ctx;
+public:
+  X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx)
+    : MCII(mcii), STI(sti), Ctx(ctx) {
+  }
+
+  ~X86MCCodeEmitter() {}
+
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+
+  static unsigned GetX86RegNum(const MCOperand &MO) {
+    return X86_MC::getX86RegNum(MO.getReg());
+  }
+
+  // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+  // 0-7 and the difference between the 2 groups is given by the REX prefix.
+  // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+  // in 1's complement form, example:
+  //
+  //  ModRM field => XMM9 => 1
+  //  VEX.VVVV    => XMM9 => ~9
+  //
+  // See table 4-35 of Intel AVX Programming Reference for details.
+  static unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                              unsigned OpNum) {
+    unsigned SrcReg = MI.getOperand(OpNum).getReg();
+    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    if ((SrcReg >= X86::XMM8 && SrcReg <= X86::XMM15) ||
+        (SrcReg >= X86::YMM8 && SrcReg <= X86::YMM15))
+      SrcRegNum += 8;
+
+    // The registers represented through VEX_VVVV should
+    // be encoded in 1's complement form.
+    return (~SrcRegNum) & 0xf;
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitImmediate(const MCOperand &Disp,
+                     unsigned ImmSize, MCFixupKind FixupKind,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int ImmOffset = 0) const;
+
+  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                        unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+
+
+  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                        unsigned RegOpcodeField,
+                        uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+                        SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const MCInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                 int MemOperand, const MCInst &MI,
+                                 raw_ostream &OS) const;
+
+  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const MCInstrDesc &Desc,
+                        raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+  unsigned Size = X86II::getSizeOfImm(TSFlags);
+  bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+  return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction with a memory
+/// operand should emit the 0x67 prefix byte in 64-bit mode due to a 32-bit
+/// memory operand.  Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// StartsWithGlobalOffsetTable - Return true for the simple cases where this
+/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support
+/// PIC on ELF i386 as that symbol is magic. We check only simple case that
+/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BE->getLHS();
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    return false;
+
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+  const MCSymbol &S = Ref->getSymbol();
+  return S.getName() == "_GLOBAL_OFFSET_TABLE_";
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
+              unsigned &CurByte, raw_ostream &OS,
+              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+  const MCExpr *Expr = NULL;
+  if (DispOp.isImm()) {
+    // If this is a simple integer displacement that doesn't require a
+    // relocation, emit it now.
+    if (FixupKind != FK_PCRel_1 &&
+        FixupKind != FK_PCRel_2 &&
+        FixupKind != FK_PCRel_4) {
+      EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+      return;
+    }
+    Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
+  } else {
+    Expr = DispOp.getExpr();
+  }
+
+  // If we have an immoffset, add it to the expression.
+  if ((FixupKind == FK_Data_4 ||
+       FixupKind == MCFixupKind(X86::reloc_signed_4byte)) &&
+      StartsWithGlobalOffsetTable(Expr)) {
+    assert(ImmOffset == 0);
+
+    FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+    ImmOffset = CurByte;
+  }
+
+  // If the fixup is pc-relative, we need to bias the value to be relative to
+  // the start of the field, not the end of the field.
+  if (FixupKind == FK_PCRel_4 ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+    ImmOffset -= 4;
+  if (FixupKind == FK_PCRel_2)
+    ImmOffset -= 2;
+  if (FixupKind == FK_PCRel_1)
+    ImmOffset -= 1;
+
+  if (ImmOffset)
+    Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+                                   Ctx);
+
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
+  EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                                        unsigned RegOpcodeField,
+                                        uint64_t TSFlags, unsigned &CurByte,
+                                        raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups) const{
+  const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+  const MCOperand &Base     = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+  unsigned BaseReg = Base.getReg();
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
+    assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode");
+    assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+    EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+    unsigned FixupKind = X86::reloc_riprel_4byte;
+
+    // movq loads are handled with a special relocation form which allows the
+    // linker to eliminate some loads for GOT references which end up in the
+    // same linkage unit.
+    if (MI.getOpcode() == X86::MOV64rm)
+      FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+    // rip-relative addressing is actually relative to the *next* instruction.
+    // Since an immediate can follow the mod/rm byte for an instruction, this
+    // means that we need to bias the immediate field of the instruction with
+    // the size of the immediate field.  If we have this case, add it into the
+    // expression to emit.
+    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+    EmitImmediate(Disp, 4, MCFixupKind(FixupKind),
+                  CurByte, OS, Fixups, -ImmSize);
+    return;
+  }
+
+  unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+  // Determine whether a SIB byte is needed.
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!is64BitMode() || BaseReg != 0)) {
+
+    if (BaseReg == 0) {          // [disp32]     in X86-32 mode
+      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+      EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (Disp.isImm() && isDisp8(Disp.getImm())) {
+      EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+      EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+      return;
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+    EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+                  Fixups);
+    return;
+  }
+
+  // We need a SIB byte, so start by outputting the ModR/M byte first
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (!Disp.isImm()) {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (Disp.getImm() == 0 &&
+             // Base reg can't be anything that ends up with '5' as the base
+             // reg, it is the magic [*] nomenclature that indicates no base.
+             BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+  } else if (isDisp8(Disp.getImm())) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+  } else {
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8)
+    EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+  else if (ForceDisp32 || Disp.getImm() != 0)
+    EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+                  Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                           int MemOperand, const MCInst &MI,
+                                           const MCInstrDesc &Desc,
+                                           raw_ostream &OS) const {
+  bool HasVEX_4V = false;
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
+    HasVEX_4V = true;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  unsigned char VEX_R = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  unsigned char VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  unsigned char VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  unsigned char VEX_W = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //
+  unsigned char VEX_5M = 0x1;
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  unsigned char VEX_4V = 0xf;
+
+  // VEX_L (Vector Length):
+  //
+  //  0: scalar or 128-bit vector
+  //  1: 256-bit vector
+  //
+  unsigned char VEX_L = 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  unsigned char VEX_PP = 0;
+
+  // Encode the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    VEX_PP = 0x01;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+    VEX_W = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+    VEX_L = 1;
+
+  switch (TSFlags & X86II::Op0Mask) {
+  default: assert(0 && "Invalid prefix!");
+  case X86II::T8:  // 0F 38
+    VEX_5M = 0x2;
+    break;
+  case X86II::TA:  // 0F 3A
+    VEX_5M = 0x3;
+    break;
+  case X86II::TF:  // F2 0F 38
+    VEX_PP = 0x3;
+    VEX_5M = 0x2;
+    break;
+  case X86II::XS:  // F3 0F
+    VEX_PP = 0x2;
+    break;
+  case X86II::XD:  // F2 0F
+    VEX_PP = 0x3;
+    break;
+  case X86II::A6:  // Bypass: Not used by VEX
+  case X86II::A7:  // Bypass: Not used by VEX
+  case X86II::TB:  // Bypass: Not used by VEX
+  case 0:
+    break;  // No prefix!
+  }
+
+  // Set the vector length to 256-bit if YMM0-YMM15 is used
+  for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
+    if (!MI.getOperand(i).isReg())
+      continue;
+    unsigned SrcReg = MI.getOperand(i).getReg();
+    if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
+      VEX_L = 1;
+  }
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned CurOp = 0;
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+  case X86II::MRMDestMem: {
+    // MRMDestMem instructions forms:
+    //  MemAddr, src1(ModR/M)
+    //  MemAddr, src1(VEX_4V), src2(ModR/M)
+    //  MemAddr, src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+
+    CurOp = X86::AddrNumOperands;
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    const MCOperand &MO = MI.getOperand(CurOp);
+    if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+      VEX_R = 0x0;
+    break;
+  }
+  case X86II::MRMSrcMem: {
+    // MRMSrcMem instructions forms:
+    //  src1(ModR/M), MemAddr
+    //  src1(ModR/M), src2(VEX_4V), MemAddr
+    //  src1(ModR/M), MemAddr, imm8
+    //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      VEX_R = 0x0;
+
+    unsigned MemAddrOffset = 1;
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, 1);
+      MemAddrOffset++;
+    }
+
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemAddrOffset+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemAddrOffset+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    break;
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+    // MRM[0-9]m instructions forms:
+    //  MemAddr
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    break;
+  case X86II::MRMSrcReg:
+    // MRMSrcReg instructions forms:
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M), src1(ModR/M)
+    //  dst(ModR/M), src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    break;
+  case X86II::MRMDestReg:
+    // MRMDestReg instructions forms:
+    //  dst(ModR/M), src(ModR/M)
+    //  dst(ModR/M), src(ModR/M), imm8
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      VEX_R = 0x0;
+    break;
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    // MRM0r-MRM7r instructions forms:
+    //  dst(VEX_4V), src(ModR/M), imm8
+    VEX_4V = getVEXRegisterEncoding(MI, 0);
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      VEX_B = 0x0;
+    break;
+  default: // RawFrm
+    break;
+  }
+
+  // Emit segment override opcode prefix as needed.
+  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+  // VEX opcode prefix can have 2 or 3 bytes
+  //
+  //  3 bytes:
+  //    +-----+ +--------------+ +-------------------+
+  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+  //    +-----+ +--------------+ +-------------------+
+  //  2 bytes:
+  //    +-----+ +-------------------+
+  //    | C5h | | R | vvvv | L | pp |
+  //    +-----+ +-------------------+
+  //
+  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+  if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix
+    EmitByte(0xC5, CurByte, OS);
+    EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+    return;
+  }
+
+  // 3 byte VEX prefix
+  EmitByte(0xC4, CurByte, OS);
+  EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+  EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                                   const MCInstrDesc &Desc) {
+  unsigned REX = 0;
+  if (TSFlags & X86II::REX_W)
+    REX |= 1 << 3; // set REX.W
+
+  if (MI.getNumOperands() == 0) return REX;
+
+  unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should explicitize the two-addrness.
+  bool isTwoAddr = NumOps > 1 &&
+                      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+  unsigned i = isTwoAddr ? 1 : 0;
+  for (; i != NumOps; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
+    // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+    // that returns non-zero.
+    REX |= 0x40; // REX fixed encoding prefix
+    break;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+  case X86II::MRMSrcReg:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 0; // set REX.B
+    }
+    break;
+  case X86II::MRMSrcMem: {
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+  case X86II::MRMDestMem: {
+    unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+    i = isTwoAddr ? 1 : 0;
+    if (NumOps > e && MI.getOperand(e).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    for (; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  default:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 0; // set REX.B
+    i = isTwoAddr ? 2 : 1;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 2; // set REX.R
+    }
+    break;
+  }
+  return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
+                                        unsigned &CurByte, int MemOperand,
+                                        const MCInst &MI,
+                                        raw_ostream &OS) const {
+  switch (TSFlags & X86II::SegOvrMask) {
+  default: assert(0 && "Invalid segment!");
+  case 0:
+    // No segment override, check for explicit one on memory operand.
+    if (MemOperand != -1) {   // If the instruction has a memory operand.
+      switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+      default: assert(0 && "Unknown segment register!");
+      case 0: break;
+      case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+      case X86::SS: EmitByte(0x36, CurByte, OS); break;
+      case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+      case X86::ES: EmitByte(0x26, CurByte, OS); break;
+      case X86::FS: EmitByte(0x64, CurByte, OS); break;
+      case X86::GS: EmitByte(0x65, CurByte, OS); break;
+      }
+    }
+    break;
+  case X86II::FS:
+    EmitByte(0x64, CurByte, OS);
+    break;
+  case X86II::GS:
+    EmitByte(0x65, CurByte, OS);
+    break;
+  }
+}
+
+/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present.  If
+/// Not present, it is -1.
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                        int MemOperand, const MCInst &MI,
+                                        const MCInstrDesc &Desc,
+                                        raw_ostream &OS) const {
+
+  // Emit the lock opcode prefix as needed.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+
+  // Emit segment override opcode prefix as needed.
+  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+  // Emit the repeat opcode prefix as needed.
+  if ((TSFlags & X86II::Op0Mask) == X86II::REP)
+    EmitByte(0xF3, CurByte, OS);
+
+  // Emit the address size opcode prefix as needed.
+  if ((TSFlags & X86II::AdSize) ||
+      (MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand)))
+    EmitByte(0x67, CurByte, OS);
+
+  // Emit the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    EmitByte(0x66, CurByte, OS);
+
+  bool Need0FPrefix = false;
+  switch (TSFlags & X86II::Op0Mask) {
+  default: assert(0 && "Invalid prefix!");
+  case 0: break;  // No prefix!
+  case X86II::REP: break; // already handled.
+  case X86II::TB:  // Two-byte opcode prefix
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+  case X86II::A6:  // 0F A6
+  case X86II::A7:  // 0F A7
+    Need0FPrefix = true;
+    break;
+  case X86II::TF: // F2 0F 38
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XS:   // F3 0F
+    EmitByte(0xF3, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
+  case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
+  case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
+  case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
+  case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
+  case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
+  case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
+  case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
+  }
+
+  // Handle REX prefix.
+  // FIXME: Can this come before F2 etc to simplify emission?
+  if (is64BitMode()) {
+    if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+      EmitByte(0x40 | REX, CurByte, OS);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    EmitByte(0x0F, CurByte, OS);
+
+  // FIXME: Pull this up into previous switch if REX can be moved earlier.
+  switch (TSFlags & X86II::Op0Mask) {
+  case X86II::TF:    // F2 0F 38
+  case X86II::T8:    // 0F 38
+    EmitByte(0x38, CurByte, OS);
+    break;
+  case X86II::TA:    // 0F 3A
+    EmitByte(0x3A, CurByte, OS);
+    break;
+  case X86II::A6:    // 0F A6
+    EmitByte(0xA6, CurByte, OS);
+    break;
+  case X86II::A7:    // 0F A7
+    EmitByte(0xA7, CurByte, OS);
+    break;
+  }
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // Pseudo instructions don't get encoded.
+  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return;
+
+  // If this is a two-address instruction, skip one of the register operands.
+  // FIXME: This should be handled during MCInst lowering.
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1)
+    ++CurOp;
+  else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0)
+    // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+    --NumOps;
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Is this instruction encoded using the AVX VEX prefix?
+  bool HasVEXPrefix = false;
+
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = false;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX)
+    HasVEXPrefix = true;
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
+    HasVEX_4V = true;
+
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  if (!HasVEXPrefix)
+    EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+  else
+    EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
+
+  unsigned SrcRegNum = 0;
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg:
+    assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+  default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+    assert(0 && "Unknown FormMask value in X86MCCodeEmitter!");
+  case X86II::Pseudo:
+    assert(0 && "Pseudo instruction shouldn't be emitted");
+  case X86II::RawFrm:
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  case X86II::RawFrmImm8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), 1, FK_Data_1, CurByte, OS, Fixups);
+    break;
+  case X86II::RawFrmImm16:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups);
+    break;
+
+  case X86II::AddRegFrm:
+    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+    break;
+
+  case X86II::MRMDestReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp),
+                     GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS);
+    CurOp += 2;
+    break;
+
+  case X86II::MRMDestMem:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + X86::AddrNumOperands;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+
+    EmitMemModRMByte(MI, CurOp,
+                     GetX86RegNum(MI.getOperand(SrcRegNum)),
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMSrcReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMSrcMem: {
+    int AddrOperands = X86::AddrNumOperands;
+    unsigned FirstMemOp = CurOp+1;
+    if (HasVEX_4V) {
+      ++AddrOperands;
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+    }
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp += AddrOperands + 1;
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      CurOp++;
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp++),
+                     (TSFlags & X86II::FormMask)-X86II::MRM0r,
+                     CurByte, OS);
+    break;
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp += X86::AddrNumOperands;
+    break;
+  case X86II::MRM_C1:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC1, CurByte, OS);
+    break;
+  case X86II::MRM_C2:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC2, CurByte, OS);
+    break;
+  case X86II::MRM_C3:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC3, CurByte, OS);
+    break;
+  case X86II::MRM_C4:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC4, CurByte, OS);
+    break;
+  case X86II::MRM_C8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC8, CurByte, OS);
+    break;
+  case X86II::MRM_C9:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC9, CurByte, OS);
+    break;
+  case X86II::MRM_E8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xE8, CurByte, OS);
+    break;
+  case X86II::MRM_F0:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xF0, CurByte, OS);
+    break;
+  case X86II::MRM_F8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xF8, CurByte, OS);
+    break;
+  case X86II::MRM_F9:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xF9, CurByte, OS);
+    break;
+  case X86II::MRM_D0:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xD0, CurByte, OS);
+    break;
+  case X86II::MRM_D1:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xD1, CurByte, OS);
+    break;
+  }
+
+  // If there is a remaining operand, it must be a trailing immediate.  Emit it
+  // according to the right size for the instruction.
+  if (CurOp != NumOps) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte, and bits[3:0] are ignored.
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+      const MCOperand &MO = MI.getOperand(CurOp++);
+      bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg());
+      unsigned RegNum = (IsExtReg ? (1 << 7) : 0);
+      RegNum |= GetX86RegNum(MO) << 4;
+      EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS,
+                    Fixups);
+    } else {
+      unsigned FixupKind;
+      // FIXME: Is there a better way to know that we need a signed relocation?
+      if (MI.getOpcode() == X86::ADD64ri32 ||
+          MI.getOpcode() == X86::MOV64ri32 ||
+          MI.getOpcode() == X86::MOV64mi32 ||
+          MI.getOpcode() == X86::PUSH64i32)
+        FixupKind = X86::reloc_signed_4byte;
+      else
+        FixupKind = getImmFixupKind(TSFlags);
+      EmitImmediate(MI.getOperand(CurOp++),
+                    X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
+                    CurByte, OS, Fixups);
+    }
+  }
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index b77f37b03f192..f98d5e331fefb 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -13,12 +13,18 @@
 
 #include "X86MCTargetDesc.h"
 #include "X86MCAsmInfo.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_REGINFO_MC_DESC
 #include "X86GenRegisterInfo.inc"
@@ -34,9 +40,16 @@ using namespace llvm;
 
 std::string X86_MC::ParseX86Triple(StringRef TT) {
   Triple TheTriple(TT);
+  std::string FS;
   if (TheTriple.getArch() == Triple::x86_64)
-    return "+64bit-mode";
-  return "-64bit-mode";
+    FS = "+64bit-mode";
+  else
+    FS = "-64bit-mode";
+  if (TheTriple.getOS() == Triple::NativeClient)
+    FS += ",+nacl-mode";
+  else
+    FS += ",-nacl-mode";
+  return FS;
 }
 
 /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
@@ -107,6 +120,135 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
   }
 }
 
+unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
+  Triple TheTriple(TT);
+  if (TheTriple.getArch() == Triple::x86_64)
+    return DWARFFlavour::X86_64;
+
+  if (TheTriple.isOSDarwin())
+    return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+  if (TheTriple.getOS() == Triple::MinGW32 ||
+      TheTriple.getOS() == Triple::Cygwin)
+    // Unsupported by now, just quick fallback
+    return DWARFFlavour::X86_32_Generic;
+  return DWARFFlavour::X86_32_Generic;
+}
+
+/// getX86RegNum - This function maps LLVM register identifiers to their X86
+/// specific numbering, which is used in various places encoding instructions.
+unsigned X86_MC::getX86RegNum(unsigned RegNo) {
+  switch(RegNo) {
+  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+    return N86::ESP;
+  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+    return N86::EBP;
+  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+    return N86::ESI;
+  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+    return N86::EDI;
+
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+    return N86::EAX;
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+    return N86::ECX;
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+    return N86::EDX;
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+    return N86::EBX;
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+    return N86::ESP;
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+    return N86::EBP;
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+    return N86::ESI;
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+    return N86::EDI;
+
+  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+    return RegNo-X86::ST0;
+
+  case X86::XMM0: case X86::XMM8:
+  case X86::YMM0: case X86::YMM8: case X86::MM0:
+    return 0;
+  case X86::XMM1: case X86::XMM9:
+  case X86::YMM1: case X86::YMM9: case X86::MM1:
+    return 1;
+  case X86::XMM2: case X86::XMM10:
+  case X86::YMM2: case X86::YMM10: case X86::MM2:
+    return 2;
+  case X86::XMM3: case X86::XMM11:
+  case X86::YMM3: case X86::YMM11: case X86::MM3:
+    return 3;
+  case X86::XMM4: case X86::XMM12:
+  case X86::YMM4: case X86::YMM12: case X86::MM4:
+    return 4;
+  case X86::XMM5: case X86::XMM13:
+  case X86::YMM5: case X86::YMM13: case X86::MM5:
+    return 5;
+  case X86::XMM6: case X86::XMM14:
+  case X86::YMM6: case X86::YMM14: case X86::MM6:
+    return 6;
+  case X86::XMM7: case X86::XMM15:
+  case X86::YMM7: case X86::YMM15: case X86::MM7:
+    return 7;
+
+  case X86::ES: return 0;
+  case X86::CS: return 1;
+  case X86::SS: return 2;
+  case X86::DS: return 3;
+  case X86::FS: return 4;
+  case X86::GS: return 5;
+
+  case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
+  case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
+  case X86::CR2: case X86::CR10: case X86::DR2: return 2;
+  case X86::CR3: case X86::CR11: case X86::DR3: return 3;
+  case X86::CR4: case X86::CR12: case X86::DR4: return 4;
+  case X86::CR5: case X86::CR13: case X86::DR5: return 5;
+  case X86::CR6: case X86::CR14: case X86::DR6: return 6;
+  case X86::CR7: case X86::CR15: case X86::DR7: return 7;
+
+  // Pseudo index registers are equivalent to a "none"
+  // scaled index (See Intel Manual 2A, table 2-3)
+  case X86::EIZ:
+  case X86::RIZ:
+    return 4;
+
+  default:
+    assert((int(RegNo) > 0) && "Unknown physical register!");
+    return 0;
+  }
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+  // FIXME: TableGen these.
+  for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+    int SEH = X86_MC::getX86RegNum(Reg);
+    switch (Reg) {
+    case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+    case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+    case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+    case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+    case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+    case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+    case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+    case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+    case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+    case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
+    case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+      SEH += 8;
+      break;
+    }
+    MRI->mapLLVMRegToSEHReg(Reg, SEH);
+  }
+}
+
 MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
                                                   StringRef FS) {
   std::string ArchFS = X86_MC::ParseX86Triple(TT);
@@ -131,55 +273,191 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-// Force static initialization.
-extern "C" void LLVMInitializeX86MCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
-                                          X86_MC::createX86MCSubtargetInfo);
-  TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
-                                          X86_MC::createX86MCSubtargetInfo);
-}
-
 static MCInstrInfo *createX86MCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitX86MCInstrInfo(X);
   return X;
 }
 
-extern "C" void LLVMInitializeX86MCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
-}
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+    ? X86::RIP     // Should have dwarf #16.
+    : X86::EIP;    // Should have dwarf #8.
 
-static MCRegisterInfo *createX86MCRegisterInfo() {
   MCRegisterInfo *X = new MCRegisterInfo();
-  InitX86MCRegisterInfo(X);
+  InitX86MCRegisterInfo(X, RA,
+                        X86_MC::getDwarfRegFlavour(TT, false),
+                        X86_MC::getDwarfRegFlavour(TT, true));
+  X86_MC::InitLLVM2SEHRegisterMapping(X);
   return X;
 }
 
-extern "C" void LLVMInitializeX86MCRegInfo() {
-  TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
-  TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
-}
-
-
 static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
+  bool is64Bit = TheTriple.getArch() == Triple::x86_64;
 
+  MCAsmInfo *MAI;
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
-    if (TheTriple.getArch() == Triple::x86_64)
-      return new X86_64MCAsmInfoDarwin(TheTriple);
+    if (is64Bit)
+      MAI = new X86_64MCAsmInfoDarwin(TheTriple);
     else
-      return new X86MCAsmInfoDarwin(TheTriple);
+      MAI = new X86MCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.isOSWindows()) {
+    MAI = new X86MCAsmInfoCOFF(TheTriple);
+  } else {
+    MAI = new X86ELFMCAsmInfo(TheTriple);
   }
 
+  // Initialize initial frame state.
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth = is64Bit ? -8 : -4;
+
+  // Initial state of the frame pointer is esp+stackGrowth.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  // Add return address to move list
+  MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+  MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
+  MAI->addInitialFrameState(0, CSDst, CSSrc);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  Triple T(TT);
+  bool is64Bit = T.getArch() == Triple::x86_64;
+
+  if (RM == Reloc::Default) {
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+    // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
+    // use static relocation model by default.
+    if (T.isOSDarwin()) {
+      if (is64Bit)
+        RM = Reloc::PIC_;
+      else
+        RM = Reloc::DynamicNoPIC;
+    } else if (T.isOSWindows() && is64Bit)
+      RM = Reloc::PIC_;
+    else
+      RM = Reloc::Static;
+  }
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode, in x86-64 we use PIC.
+  if (RM == Reloc::DynamicNoPIC) {
+    if (is64Bit)
+      RM = Reloc::PIC_;
+    else if (!T.isOSDarwin())
+      RM = Reloc::Static;
+  }
+
+  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+  // the Mach-O file format doesn't support it.
+  if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+    RM = Reloc::PIC_;
+
+  // For static codegen, if we're not already set, use Small codegen.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    // 64-bit JIT places everything in the same buffer except external funcs.
+    CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+
   if (TheTriple.isOSWindows())
-    return new X86MCAsmInfoCOFF(TheTriple);
+    return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
+
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new X86ATTInstPrinter(MAI);
+  if (SyntaxVariant == 1)
+    return new X86IntelInstPrinter(MAI);
+  return 0;
+}
 
-  return new X86ELFMCAsmInfo(TheTriple);
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+  return new MCInstrAnalysis(Info);
 }
 
-extern "C" void LLVMInitializeX86MCAsmInfo() {
-  // Register the target asm info.
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+  // Register the MC asm info.
   RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo);
   RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo);
+
+  // Register the MC codegen info.
+  RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo);
+  RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target,
+                                          createX86MCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target,
+                                          createX86MCInstrAnalysis);
+
+  // Register the code emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target,
+                                        createX86MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target,
+                                        createX86MCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+                                       createX86_32AsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+                                       createX86_64AsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,
+                                        createX86MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
+                                        createX86MCInstPrinter);
 }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 89ea22b31be2f..c144c513de152 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -14,15 +14,39 @@
 #ifndef X86MCTARGETDESC_H
 #define X86MCTARGETDESC_H
 
+#include "llvm/Support/DataTypes.h"
 #include <string>
 
 namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
 class MCSubtargetInfo;
 class Target;
 class StringRef;
+class raw_ostream;
 
 extern Target TheX86_32Target, TheX86_64Target;
 
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+  };
+} 
+  
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
 namespace X86_MC {
   std::string ParseX86Triple(StringRef TT);
 
@@ -33,13 +57,32 @@ namespace X86_MC {
 
   void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
 
-  /// createARMMCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
+  unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+
+  unsigned getX86RegNum(unsigned RegNo);
+
+  void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+  /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
   /// This is exposed so Asm parser, etc. do not need to go through
   /// TargetRegistry.
   MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
                                             StringRef FS);
 }
 
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT);
+
+/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
 } // End llvm namespace
 
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 0000000000000..f0f1982d57f20
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,554 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Object/MachOFormat.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+  void RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 unsigned Log2Size,
+                                 uint64_t &FixedValue);
+  void RecordTLVPRelocation(MachObjectWriter *Writer,
+                            const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment,
+                            const MCFixup &Fixup,
+                            MCValue Target,
+                            uint64_t &FixedValue);
+
+  void RecordX86Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+  void RecordX86_64Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+public:
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) {
+    if (Writer->is64Bit())
+      RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                             FixedValue);
+    else
+      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+  return Kind == X86::reloc_riprel_4byte ||
+    Kind == X86::reloc_riprel_4byte_movq_load;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+    // FIXME: Remove these!!!
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case FK_Data_4: return 2;
+  case FK_Data_8: return 3;
+  }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
+                                                 const MCAssembler &Asm,
+                                                 const MCAsmLayout &Layout,
+                                                 const MCFragment *Fragment,
+                                                 const MCFixup &Fixup,
+                                                 MCValue Target,
+                                                 uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset =
+    Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  uint32_t FixupAddress =
+    Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  Value = Target.getConstant();
+
+  if (IsPCRel) {
+    // Compensate for the relocation offset, Darwin x86_64 relocations only have
+    // the addend and appear to have attempted to define it to be the actual
+    // expression addend without the PCrel bias. However, instructions with data
+    // following the relocation are not accommodated for (see comment below
+    // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+    Value += 1LL << Log2Size;
+  }
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    Type = macho::RIT_X86_64_Unsigned;
+    Index = 0;
+
+    // FIXME: I believe this is broken, I don't think the linker can understand
+    // it. I think it would require a local relocation, but I'm not sure if that
+    // would work either. The official way to get an absolute PCrel relocation
+    // is to use an absolute symbol (which we don't support yet).
+    if (IsPCRel) {
+      IsExtern = 1;
+      Type = macho::RIT_X86_64_Branch;
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+    // Neither symbol can be modified.
+    if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+        Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+      report_fatal_error("unsupported relocation of modified symbol");
+
+    // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+    // implement most of these correctly.
+    if (IsPCRel)
+      report_fatal_error("unsupported pc-relative relocation of difference");
+
+    // The support for the situation where one or both of the symbols would
+    // require a local relocation is handled just like if the symbols were
+    // external.  This is certainly used in the case of debug sections where the
+    // section has only temporary symbols and thus the symbols don't have base
+    // symbols.  This is encoded using the section ordinal and non-extern
+    // relocation entries.
+
+    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+    // single SIGNED relocation); reject it for now.  Except the case where both
+    // symbols don't have a base, equal but both NULL.
+    if (A_Base == B_Base && A_Base)
+      report_fatal_error("unsupported relocation with identical base");
+
+    Value += Writer->getSymbolAddress(&A_SD, Layout) -
+      (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+    Value -= Writer->getSymbolAddress(&B_SD, Layout) -
+      (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+
+    if (A_Base) {
+      Index = A_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = macho::RIT_X86_64_Unsigned;
+
+    macho::RelocationEntry MRE;
+    MRE.Word0 = FixupOffset;
+    MRE.Word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    if (B_Base) {
+      Index = B_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = macho::RIT_X86_64_Subtractor;
+  } else {
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand x86_64 relocation entries, and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
+        Fragment->getParent()->getSection());
+      if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG))
+        Base = 0;
+    }
+
+    // x86_64 almost always uses external relocations, except when there is no
+    // symbol to use as a base address (a local symbol with no preceding
+    // non-local symbol).
+    if (Base) {
+      Index = Base->getIndex();
+      IsExtern = 1;
+
+      // Add the local offset, if needed.
+      if (Base != &SD)
+        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+    } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+      // The index is the section ordinal (1-based).
+      Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+      Value += Writer->getSymbolAddress(&SD, Layout);
+
+      if (IsPCRel)
+        Value -= FixupAddress + (1 << Log2Size);
+    } else if (Symbol->isVariable()) {
+      const MCExpr *Value = Symbol->getVariableValue();
+      int64_t Res;
+      bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
+                                             Writer->getSectionAddressMap());
+      if (isAbs) {
+        FixedValue = Res;
+        return;
+      } else {
+        report_fatal_error("unsupported relocation of variable '" +
+                           Symbol->getName() + "'");
+      }
+    } else {
+      report_fatal_error("unsupported relocation of undefined symbol '" +
+                         Symbol->getName() + "'");
+    }
+
+    MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+    if (IsPCRel) {
+      if (IsRIPRel) {
+        if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+          // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+          // rewrite the movq to an leaq at link time if the symbol ends up in
+          // the same linkage unit.
+          if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+            Type = macho::RIT_X86_64_GOTLoad;
+          else
+            Type = macho::RIT_X86_64_GOT;
+        }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+          Type = macho::RIT_X86_64_TLV;
+        }  else if (Modifier != MCSymbolRefExpr::VK_None) {
+          report_fatal_error("unsupported symbol modifier in relocation");
+        } else {
+          Type = macho::RIT_X86_64_Signed;
+
+          // The Darwin x86_64 relocation format has a problem where it cannot
+          // encode an address (L<foo> + <constant>) which is outside the atom
+          // containing L<foo>. Generally, this shouldn't occur but it does
+          // happen when we have a RIPrel instruction with data following the
+          // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+          // adjustment Darwin x86_64 uses, the offset is still negative and the
+          // linker has no way to recognize this.
+          //
+          // To work around this, Darwin uses several special relocation types
+          // to indicate the offsets. However, the specification or
+          // implementation of these seems to also be incomplete; they should
+          // adjust the addend as well based on the actual encoded instruction
+          // (the additional bias), but instead appear to just look at the final
+          // offset.
+          switch (-(Target.getConstant() + (1LL << Log2Size))) {
+          case 1: Type = macho::RIT_X86_64_Signed1; break;
+          case 2: Type = macho::RIT_X86_64_Signed2; break;
+          case 4: Type = macho::RIT_X86_64_Signed4; break;
+          }
+        }
+      } else {
+        if (Modifier != MCSymbolRefExpr::VK_None)
+          report_fatal_error("unsupported symbol modifier in branch "
+                             "relocation");
+
+        Type = macho::RIT_X86_64_Branch;
+      }
+    } else {
+      if (Modifier == MCSymbolRefExpr::VK_GOT) {
+        Type = macho::RIT_X86_64_GOT;
+      } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+        // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+        // case all we do is set the PCrel bit in the relocation entry; this is
+        // used with exception handling, for example. The source is required to
+        // include any necessary offset directly.
+        Type = macho::RIT_X86_64_GOT;
+        IsPCRel = 1;
+      } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+        report_fatal_error("TLVP symbol modifier should have been rip-rel");
+      } else if (Modifier != MCSymbolRefExpr::VK_None)
+        report_fatal_error("unsupported symbol modifier in relocation");
+      else
+        Type = macho::RIT_X86_64_Unsigned;
+    }
+  }
+
+  // x86_64 always writes custom values into the fixups.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_Vanilla;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    //
+    // Note that there is no longer any semantic difference between these two
+    // relocation types from the linkers point of view, this is done solely for
+    // pedantic compatibility with 'as'.
+    Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
+      (unsigned)macho::RIT_Generic_LocalDifference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == macho::RIT_Difference ||
+      Type == macho::RIT_Generic_LocalDifference) {
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((0         <<  0) |
+                 (macho::RIT_Pair  << 24) |
+                 (Log2Size  << 28) |
+                 (IsPCRel   << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (Log2Size    << 28) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
+                                               const MCAssembler &Asm,
+                                               const MCAsmLayout &Layout,
+                                               const MCFragment *Fragment,
+                                               const MCFixup &Fixup,
+                                               MCValue Target,
+                                               uint64_t &FixedValue) {
+  assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+         !is64Bit() &&
+         "Should only be called with a 32-bit TLVP relocation!");
+
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+  uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = 0;
+
+  // Get the symbol data.
+  MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+  unsigned Index = SD_A->getIndex();
+
+  // We're only going to have a second symbol in pic mode and it'll be a
+  // subtraction from the picbase. For 32-bit pic the addend is the difference
+  // between the picbase and the next address.  For 32-bit static the addend is
+  // zero.
+  if (Target.getSymB()) {
+    // If this is a subtraction then we're pcrel.
+    uint32_t FixupAddress =
+      Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+    MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
+    IsPCRel = 1;
+    FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
+                  Target.getConstant());
+    FixedValue += 1ULL << Log2Size;
+  } else {
+    FixedValue = 0;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = Value;
+  MRE.Word1 = ((Index                  <<  0) |
+               (IsPCRel                << 24) |
+               (Log2Size               << 25) |
+               (1                      << 27) | // Extern
+               (macho::RIT_Generic_TLV << 28)); // Type
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+                                              const MCAssembler &Asm,
+                                              const MCAsmLayout &Layout,
+                                              const MCFragment *Fragment,
+                                              const MCFixup &Fixup,
+                                              MCValue Target,
+                                              uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // If this is a 32-bit TLVP reloc it's handled a bit differently.
+  if (Target.getSymA() &&
+      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+    RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                         FixedValue);
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB())
+    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                     Target, Log2Size, FixedValue);
+
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // If this is an internal relocation with an offset, it also needs a scattered
+  // relocation entry.
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel)
+    Offset += 1 << Log2Size;
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
+    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                     Target, Log2Size, FixedValue);
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    Type = macho::RIT_Vanilla;
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    Type = macho::RIT_Vanilla;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index f16ec029e96ab..7d901afae4749 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -862,7 +862,7 @@ define float @bar(float %x) nounwind {
 
 This IR (from PR6194):
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-darwin10.0.0"
 
 %0 = type { double, double }
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 560947a4a04b7..b40795506071b 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,11 +2,6 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
-We should add support for the "movbe" instruction, which does a byte-swapping
-copy (3-addr bswap + memory support?)  This is available on Atom processors.
-
-//===---------------------------------------------------------------------===//
-
 This should be one DIV/IDIV instruction, not a libcall:
 
 unsigned test(unsigned long long X, unsigned Y) {
@@ -1222,7 +1217,7 @@ Also check why xmm7 is not used at all in the function.
 
 Take the following:
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
 target triple = "i386-apple-darwin8"
 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
 define fastcc void @abort_gzip() noreturn nounwind  {
@@ -2066,3 +2061,34 @@ The trick is to match "fetch_and_add(X, -C) == C".
 
 //===---------------------------------------------------------------------===//
 
+unsigned log2(unsigned x) {
+  return x > 1 ? 32-__builtin_clz(x-1) : 0;
+}
+
+generates (x86_64):
+	xorl	%eax, %eax
+	cmpl	$2, %edi
+	jb	LBB0_2
+## BB#1:
+	decl	%edi
+	movl	$63, %ecx
+	bsrl	%edi, %eax
+	cmovel	%ecx, %eax
+	xorl	$-32, %eax
+	addl	$33, %eax
+LBB0_2:
+	ret
+
+The cmov and the early test are redundant:
+	xorl	%eax, %eax
+	cmpl	$2, %edi
+	jb	LBB0_2
+## BB#1:
+	decl	%edi
+	bsrl	%edi, %eax
+	xorl	$-32, %eax
+	addl	$33, %eax
+LBB0_2:
+	ret
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/Target/X86/SSEDomainFix.cpp
deleted file mode 100644
index 13680c592e01b..0000000000000
--- a/lib/Target/X86/SSEDomainFix.cpp
+++ /dev/null
@@ -1,506 +0,0 @@
-//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the SSEDomainFix pass.
-//
-// Some SSE instructions like mov, and, or, xor are available in different
-// variants for different operand types. These variant instructions are
-// equivalent, but on Nehalem and newer cpus there is extra latency
-// transferring data between integer and floating point domains.
-//
-// This pass changes the variant instructions to minimize domain crossings.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "sse-domain-fix"
-#include "X86InstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
-/// of execution domains.
-///
-/// An open DomainValue represents a set of instructions that can still switch
-/// execution domain. Multiple registers may refer to the same open
-/// DomainValue - they will eventually be collapsed to the same execution
-/// domain.
-///
-/// A collapsed DomainValue represents a single register that has been forced
-/// into one of more execution domains. There is a separate collapsed
-/// DomainValue for each register, but it may contain multiple execution
-/// domains. A register value is initially created in a single execution
-/// domain, but if we were forced to pay the penalty of a domain crossing, we
-/// keep track of the fact the the register is now available in multiple
-/// domains.
-namespace {
-struct DomainValue {
-  // Basic reference counting.
-  unsigned Refs;
-
-  // Bitmask of available domains. For an open DomainValue, it is the still
-  // possible domains for collapsing. For a collapsed DomainValue it is the
-  // domains where the register is available for free.
-  unsigned AvailableDomains;
-
-  // Position of the last defining instruction.
-  unsigned Dist;
-
-  // Twiddleable instructions using or defining these registers.
-  SmallVector<MachineInstr*, 8> Instrs;
-
-  // A collapsed DomainValue has no instructions to twiddle - it simply keeps
-  // track of the domains where the registers are already available.
-  bool isCollapsed() const { return Instrs.empty(); }
-
-  // Is domain available?
-  bool hasDomain(unsigned domain) const {
-    return AvailableDomains & (1u << domain);
-  }
-
-  // Mark domain as available.
-  void addDomain(unsigned domain) {
-    AvailableDomains |= 1u << domain;
-  }
-
-  // Restrict to a single domain available.
-  void setSingleDomain(unsigned domain) {
-    AvailableDomains = 1u << domain;
-  }
-
-  // Return bitmask of domains that are available and in mask.
-  unsigned getCommonDomains(unsigned mask) const {
-    return AvailableDomains & mask;
-  }
-
-  // First domain available.
-  unsigned getFirstDomain() const {
-    return CountTrailingZeros_32(AvailableDomains);
-  }
-
-  DomainValue() { clear(); }
-
-  void clear() {
-    Refs = AvailableDomains = Dist = 0;
-    Instrs.clear();
-  }
-};
-}
-
-static const unsigned NumRegs = 16;
-
-namespace {
-class SSEDomainFixPass : public MachineFunctionPass {
-  static char ID;
-  SpecificBumpPtrAllocator<DomainValue> Allocator;
-  SmallVector<DomainValue*,16> Avail;
-
-  MachineFunction *MF;
-  const X86InstrInfo *TII;
-  const TargetRegisterInfo *TRI;
-  MachineBasicBlock *MBB;
-  DomainValue **LiveRegs;
-  typedef DenseMap<MachineBasicBlock*,DomainValue**> LiveOutMap;
-  LiveOutMap LiveOuts;
-  unsigned Distance;
-
-public:
-  SSEDomainFixPass() : MachineFunctionPass(ID) {}
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  virtual const char *getPassName() const {
-    return "SSE execution domain fixup";
-  }
-
-private:
-  // Register mapping.
-  int RegIndex(unsigned Reg);
-
-  // DomainValue allocation.
-  DomainValue *Alloc(int domain = -1);
-  void Recycle(DomainValue*);
-
-  // LiveRegs manipulations.
-  void SetLiveReg(int rx, DomainValue *DV);
-  void Kill(int rx);
-  void Force(int rx, unsigned domain);
-  void Collapse(DomainValue *dv, unsigned domain);
-  bool Merge(DomainValue *A, DomainValue *B);
-
-  void enterBasicBlock();
-  void visitGenericInstr(MachineInstr*);
-  void visitSoftInstr(MachineInstr*, unsigned mask);
-  void visitHardInstr(MachineInstr*, unsigned domain);
-};
-}
-
-char SSEDomainFixPass::ID = 0;
-
-/// Translate TRI register number to an index into our smaller tables of
-/// interesting registers. Return -1 for boring registers.
-int SSEDomainFixPass::RegIndex(unsigned reg) {
-  assert(X86::XMM15 == X86::XMM0+NumRegs-1 && "Unexpected sort");
-  reg -= X86::XMM0;
-  return reg < NumRegs ? (int) reg : -1;
-}
-
-DomainValue *SSEDomainFixPass::Alloc(int domain) {
-  DomainValue *dv = Avail.empty() ?
-                      new(Allocator.Allocate()) DomainValue :
-                      Avail.pop_back_val();
-  dv->Dist = Distance;
-  if (domain >= 0)
-    dv->addDomain(domain);
-  return dv;
-}
-
-void SSEDomainFixPass::Recycle(DomainValue *dv) {
-  assert(dv && "Cannot recycle NULL");
-  dv->clear();
-  Avail.push_back(dv);
-}
-
-/// Set LiveRegs[rx] = dv, updating reference counts.
-void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) {
-  assert(unsigned(rx) < NumRegs && "Invalid index");
-  if (!LiveRegs) {
-    LiveRegs = new DomainValue*[NumRegs];
-    std::fill(LiveRegs, LiveRegs+NumRegs, (DomainValue*)0);
-  }
-
-  if (LiveRegs[rx] == dv)
-    return;
-  if (LiveRegs[rx]) {
-    assert(LiveRegs[rx]->Refs && "Bad refcount");
-    if (--LiveRegs[rx]->Refs == 0) Recycle(LiveRegs[rx]);
-  }
-  LiveRegs[rx] = dv;
-  if (dv) ++dv->Refs;
-}
-
-// Kill register rx, recycle or collapse any DomainValue.
-void SSEDomainFixPass::Kill(int rx) {
-  assert(unsigned(rx) < NumRegs && "Invalid index");
-  if (!LiveRegs || !LiveRegs[rx]) return;
-
-  // Before killing the last reference to an open DomainValue, collapse it to
-  // the first available domain.
-  if (LiveRegs[rx]->Refs == 1 && !LiveRegs[rx]->isCollapsed())
-    Collapse(LiveRegs[rx], LiveRegs[rx]->getFirstDomain());
-  else
-    SetLiveReg(rx, 0);
-}
-
-/// Force register rx into domain.
-void SSEDomainFixPass::Force(int rx, unsigned domain) {
-  assert(unsigned(rx) < NumRegs && "Invalid index");
-  DomainValue *dv;
-  if (LiveRegs && (dv = LiveRegs[rx])) {
-    if (dv->isCollapsed())
-      dv->addDomain(domain);
-    else if (dv->hasDomain(domain))
-      Collapse(dv, domain);
-    else {
-      // This is an incompatible open DomainValue. Collapse it to whatever and force
-      // the new value into domain. This costs a domain crossing.
-      Collapse(dv, dv->getFirstDomain());
-      assert(LiveRegs[rx] && "Not live after collapse?");
-      LiveRegs[rx]->addDomain(domain);
-    }
-  } else {
-    // Set up basic collapsed DomainValue.
-    SetLiveReg(rx, Alloc(domain));
-  }
-}
-
-/// Collapse open DomainValue into given domain. If there are multiple
-/// registers using dv, they each get a unique collapsed DomainValue.
-void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) {
-  assert(dv->hasDomain(domain) && "Cannot collapse");
-
-  // Collapse all the instructions.
-  while (!dv->Instrs.empty())
-    TII->SetSSEDomain(dv->Instrs.pop_back_val(), domain);
-  dv->setSingleDomain(domain);
-
-  // If there are multiple users, give them new, unique DomainValues.
-  if (LiveRegs && dv->Refs > 1)
-    for (unsigned rx = 0; rx != NumRegs; ++rx)
-      if (LiveRegs[rx] == dv)
-        SetLiveReg(rx, Alloc(domain));
-}
-
-/// Merge - All instructions and registers in B are moved to A, and B is
-/// released.
-bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) {
-  assert(!A->isCollapsed() && "Cannot merge into collapsed");
-  assert(!B->isCollapsed() && "Cannot merge from collapsed");
-  if (A == B)
-    return true;
-  // Restrict to the domains that A and B have in common.
-  unsigned common = A->getCommonDomains(B->AvailableDomains);
-  if (!common)
-    return false;
-  A->AvailableDomains = common;
-  A->Dist = std::max(A->Dist, B->Dist);
-  A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
-  for (unsigned rx = 0; rx != NumRegs; ++rx)
-    if (LiveRegs[rx] == B)
-      SetLiveReg(rx, A);
-  return true;
-}
-
-void SSEDomainFixPass::enterBasicBlock() {
-  // Try to coalesce live-out registers from predecessors.
-  for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
-         e = MBB->livein_end(); i != e; ++i) {
-    int rx = RegIndex(*i);
-    if (rx < 0) continue;
-    for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
-           pe = MBB->pred_end(); pi != pe; ++pi) {
-      LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
-      if (fi == LiveOuts.end()) continue;
-      DomainValue *pdv = fi->second[rx];
-      if (!pdv) continue;
-      if (!LiveRegs || !LiveRegs[rx]) {
-        SetLiveReg(rx, pdv);
-        continue;
-      }
-
-      // We have a live DomainValue from more than one predecessor.
-      if (LiveRegs[rx]->isCollapsed()) {
-        // We are already collapsed, but predecessor is not. Force him.
-        unsigned domain = LiveRegs[rx]->getFirstDomain();
-        if (!pdv->isCollapsed() && pdv->hasDomain(domain))
-          Collapse(pdv, domain);
-        continue;
-      }
-
-      // Currently open, merge in predecessor.
-      if (!pdv->isCollapsed())
-        Merge(LiveRegs[rx], pdv);
-      else
-        Force(rx, pdv->getFirstDomain());
-    }
-  }
-}
-
-// A hard instruction only works in one domain. All input registers will be
-// forced into that domain.
-void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) {
-  // Collapse all uses.
-  for (unsigned i = mi->getDesc().getNumDefs(),
-                e = mi->getDesc().getNumOperands(); i != e; ++i) {
-    MachineOperand &mo = mi->getOperand(i);
-    if (!mo.isReg()) continue;
-    int rx = RegIndex(mo.getReg());
-    if (rx < 0) continue;
-    Force(rx, domain);
-  }
-
-  // Kill all defs and force them.
-  for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
-    MachineOperand &mo = mi->getOperand(i);
-    if (!mo.isReg()) continue;
-    int rx = RegIndex(mo.getReg());
-    if (rx < 0) continue;
-    Kill(rx);
-    Force(rx, domain);
-  }
-}
-
-// A soft instruction can be changed to work in other domains given by mask.
-void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) {
-  // Bitmask of available domains for this instruction after taking collapsed
-  // operands into account.
-  unsigned available = mask;
-
-  // Scan the explicit use operands for incoming domains.
-  SmallVector<int, 4> used;
-  if (LiveRegs)
-    for (unsigned i = mi->getDesc().getNumDefs(),
-                  e = mi->getDesc().getNumOperands(); i != e; ++i) {
-      MachineOperand &mo = mi->getOperand(i);
-      if (!mo.isReg()) continue;
-      int rx = RegIndex(mo.getReg());
-      if (rx < 0) continue;
-      if (DomainValue *dv = LiveRegs[rx]) {
-        // Bitmask of domains that dv and available have in common.
-        unsigned common = dv->getCommonDomains(available);
-        // Is it possible to use this collapsed register for free?
-        if (dv->isCollapsed()) {
-          // Restrict available domains to the ones in common with the operand.
-          // If there are no common domains, we must pay the cross-domain 
-          // penalty for this operand.
-          if (common) available = common;
-        } else if (common)
-          // Open DomainValue is compatible, save it for merging.
-          used.push_back(rx);
-        else
-          // Open DomainValue is not compatible with instruction. It is useless
-          // now.
-          Kill(rx);
-      }
-    }
-
-  // If the collapsed operands force a single domain, propagate the collapse.
-  if (isPowerOf2_32(available)) {
-    unsigned domain = CountTrailingZeros_32(available);
-    TII->SetSSEDomain(mi, domain);
-    visitHardInstr(mi, domain);
-    return;
-  }
-
-  // Kill off any remaining uses that don't match available, and build a list of
-  // incoming DomainValues that we want to merge.
-  SmallVector<DomainValue*,4> doms;
-  for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
-    int rx = *i;
-    DomainValue *dv = LiveRegs[rx];
-    // This useless DomainValue could have been missed above.
-    if (!dv->getCommonDomains(available)) {
-      Kill(*i);
-      continue;
-    }
-    // sorted, uniqued insert.
-    bool inserted = false;
-    for (SmallVector<DomainValue*,4>::iterator i = doms.begin(), e = doms.end();
-           i != e && !inserted; ++i) {
-      if (dv == *i)
-        inserted = true;
-      else if (dv->Dist < (*i)->Dist) {
-        inserted = true;
-        doms.insert(i, dv);
-      }
-    }
-    if (!inserted)
-      doms.push_back(dv);
-  }
-
-  // doms are now sorted in order of appearance. Try to merge them all, giving
-  // priority to the latest ones.
-  DomainValue *dv = 0;
-  while (!doms.empty()) {
-    if (!dv) {
-      dv = doms.pop_back_val();
-      continue;
-    }
-
-    DomainValue *latest = doms.pop_back_val();
-    if (Merge(dv, latest)) continue;
-
-    // If latest didn't merge, it is useless now. Kill all registers using it.
-    for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i)
-      if (LiveRegs[*i] == latest)
-        Kill(*i);
-  }
-
-  // dv is the DomainValue we are going to use for this instruction.
-  if (!dv)
-    dv = Alloc();
-  dv->Dist = Distance;
-  dv->AvailableDomains = available;
-  dv->Instrs.push_back(mi);
-
-  // Finally set all defs and non-collapsed uses to dv.
-  for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) {
-    MachineOperand &mo = mi->getOperand(i);
-    if (!mo.isReg()) continue;
-    int rx = RegIndex(mo.getReg());
-    if (rx < 0) continue;
-    if (!LiveRegs || !LiveRegs[rx] || (mo.isDef() && LiveRegs[rx]!=dv)) {
-      Kill(rx);
-      SetLiveReg(rx, dv);
-    }
-  }
-}
-
-void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) {
-  // Process explicit defs, kill any XMM registers redefined.
-  for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
-    MachineOperand &mo = mi->getOperand(i);
-    if (!mo.isReg()) continue;
-    int rx = RegIndex(mo.getReg());
-    if (rx < 0) continue;
-    Kill(rx);
-  }
-}
-
-bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) {
-  MF = &mf;
-  TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo());
-  TRI = MF->getTarget().getRegisterInfo();
-  MBB = 0;
-  LiveRegs = 0;
-  Distance = 0;
-  assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass");
-
-  // If no XMM registers are used in the function, we can skip it completely.
-  bool anyregs = false;
-  for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(),
-         E = X86::VR128RegClass.end(); I != E; ++I)
-    if (MF->getRegInfo().isPhysRegUsed(*I)) {
-      anyregs = true;
-      break;
-    }
-  if (!anyregs) return false;
-
-  MachineBasicBlock *Entry = MF->begin();
-  SmallPtrSet<MachineBasicBlock*, 16> Visited;
-  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> >
-         DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited);
-         DFI != DFE; ++DFI) {
-    MBB = *DFI;
-    enterBasicBlock();
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
-        ++I) {
-      MachineInstr *mi = I;
-      if (mi->isDebugValue()) continue;
-      ++Distance;
-      std::pair<uint16_t, uint16_t> domp = TII->GetSSEDomain(mi);
-      if (domp.first)
-        if (domp.second)
-          visitSoftInstr(mi, domp.second);
-        else
-          visitHardInstr(mi, domp.first);
-      else if (LiveRegs)
-        visitGenericInstr(mi);
-    }
-
-    // Save live registers at end of MBB - used by enterBasicBlock().
-    if (LiveRegs)
-      LiveOuts.insert(std::make_pair(MBB, LiveRegs));
-    LiveRegs = 0;
-  }
-
-  // Clear the LiveOuts vectors. Should we also collapse any remaining
-  // DomainValues?
-  for (LiveOutMap::const_iterator i = LiveOuts.begin(), e = LiveOuts.end();
-         i != e; ++i)
-    delete[] i->second;
-  LiveOuts.clear();
-  Avail.clear();
-  Allocator.DestroyAll();
-
-  return false;
-}
-
-FunctionPass *llvm::createSSEDomainFixPass() {
-  return new SSEDomainFixPass();
-}
diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt
index 90be9f58cc73f..4da00fa44f4d4 100644
--- a/lib/Target/X86/TargetInfo/CMakeLists.txt
+++ b/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMX86Info
   X86TargetInfo.cpp
   )
 
-add_dependencies(LLVMX86Info X86CodeGenTable_gen)
+add_llvm_library_dependencies(LLVMX86Info
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMX86Info X86CommonTableGen)
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 08d4d84f8a8ad..52a67f763b0a0 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "X86.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheX86_32Target, llvm::TheX86_64Target;
diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt
index 3ad5f991c8653..caffd8b378168 100644
--- a/lib/Target/X86/Utils/CMakeLists.txt
+++ b/lib/Target/X86/Utils/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMX86Utils
   X86ShuffleDecode.cpp
   )
-add_dependencies(LLVMX86Utils X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86Utils
+  LLVMCore
+  LLVMSupport
+  )
+
+add_dependencies(LLVMX86Utils X86CommonTableGen)
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060748b7c..aeb3309d09aa3 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -167,24 +167,77 @@ void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask) {
   unsigned NumElts = VT.getVectorNumElements();
 
-  // Handle vector lengths > 128 bits.  Define a "section" as a set of
-  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
-  // sections.
-  unsigned NumSections = VT.getSizeInBits() / 128;
-  if (NumSections == 0 ) NumSections = 1;  // Handle MMX
-  unsigned NumSectionElts = NumElts / NumSections;
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
 
   unsigned Start = 0;
-  unsigned End = NumSectionElts / 2;
-  for (unsigned s = 0; s < NumSections; ++s) {
+  unsigned End = NumLaneElts / 2;
+  for (unsigned s = 0; s < NumLanes; ++s) {
     for (unsigned i = Start; i != End; ++i) {
       ShuffleMask.push_back(i);                 // Reads from dest/src1
-      ShuffleMask.push_back(i+NumSectionElts);  // Reads from src/src2
+      ShuffleMask.push_back(i+NumLaneElts);  // Reads from src/src2
     }
     // Process the next 128 bits.
-    Start += NumSectionElts;
-    End += NumSectionElts;
+    Start += NumLaneElts;
+    End += NumLaneElts;
   }
 }
 
+// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit
+// elements. For 256-bit vectors, it's considered as two 128 lanes, the
+// referenced elements can't cross lanes and the mask of the first lane must
+// be the same of the second.
+void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned NumLanes = (NumElts*32)/128;
+  unsigned LaneSize = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      unsigned Idx = (Imm >> (i*2)) & 0x3 ;
+      ShuffleMask.push_back(Idx+(l*LaneSize));
+    }
+  }
+}
+
+// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit
+// elements. For 256-bit vectors, it's considered as two 128 lanes, the
+// referenced elements can't cross lanes but the mask of the first lane can
+// be the different of the second (not like VPERMILPS).
+void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned NumLanes = (NumElts*64)/128;
+  unsigned LaneSize = NumElts/NumLanes;
+
+  for (unsigned l = 0; l < NumLanes; ++l) {
+    for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+      unsigned Idx = (Imm >> i) & 0x1;
+      ShuffleMask.push_back(Idx+(l*LaneSize));
+    }
+  }
+}
+
+void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+                          SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+  unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;
+  unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
+
+  for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i)
+    ShuffleMask.push_back(i);
+  for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i)
+    ShuffleMask.push_back(i);
+}
+
+void DecodeVPERM2F128Mask(unsigned Imm,
+                          SmallVectorImpl<unsigned> &ShuffleMask) {
+  // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only
+  // has information about the instruction and not the types. So for
+  // instruction comments purpose, assume the 256-bit vector is v4i64.
+  return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask);
+}
+
 } // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f670330961..58193e6a46887 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,26 @@ void DecodeUNPCKLPDMask(unsigned NElts,
 void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask);
 
+
+// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit
+// elements. For 256-bit vectors, it's considered as two 128 lanes, the
+// referenced elements can't cross lanes and the mask of the first lane must
+// be the same of the second.
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                        SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit
+// elements. For 256-bit vectors, it's considered as two 128 lanes, the
+// referenced elements can't cross lanes but the mask of the first lane can
+// be the different of the second (not like VPERMILPS).
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                        SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERM2F128Mask(unsigned Imm,
+                          SmallVectorImpl<unsigned> &ShuffleMask);
+void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+                          SmallVectorImpl<unsigned> &ShuffleMask);
+
 } // llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index ec52dfb3e7d11..81e94227fca6c 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -15,6 +15,7 @@
 #ifndef TARGET_X86_H
 #define TARGET_X86_H
 
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Target/TargetMachine.h"
@@ -24,16 +25,8 @@ namespace llvm {
 class FunctionPass;
 class JITCodeEmitter;
 class MachineCodeEmitter;
-class MCCodeEmitter;
-class MCContext;
-class MCInstrInfo;
-class MCObjectWriter;
-class MCSubtargetInfo;
 class Target;
-class TargetAsmBackend;
 class X86TargetMachine;
-class formatted_raw_ostream;
-class raw_ostream;
 
 /// createX86ISelDag - This pass converts a legalized DAG into a 
 /// X86-specific DAG, ready for instruction scheduling.
@@ -51,22 +44,16 @@ FunctionPass* createGlobalBaseRegPass();
 ///
 FunctionPass *createX86FloatingPointStackifierPass();
 
-/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain
-/// crossings.
-FunctionPass *createSSEDomainFixPass();
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
 
 /// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
 /// to the specified MCE object.
 FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
                                           JITCodeEmitter &JCE);
 
-MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
-                                      const MCSubtargetInfo &STI,
-                                      MCContext &Ctx);
-
-TargetAsmBackend *createX86_32AsmBackend(const Target &, const std::string &);
-TargetAsmBackend *createX86_64AsmBackend(const Target &, const std::string &);
-
 /// createX86EmitCodeToMemory - Returns a pass that converts a register
 /// allocated function into raw machine code in a dynamically
 /// allocated chunk of memory.
@@ -79,13 +66,6 @@ FunctionPass *createEmitX86CodeToMemory();
 ///
 FunctionPass *createX86MaxStackAlignmentHeuristicPass();
 
-
-/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
-MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
-                                          bool Is64Bit,
-                                          uint32_t CPUType,
-                                          uint32_t CPUSubtype);
-
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 4ccb43fe18cc3..104b91fd35340 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a target description file for the Intel i386 architecture, referred to
-// here as the "X86" architecture.
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
 //
 //===----------------------------------------------------------------------===//
 
@@ -23,6 +23,9 @@ include "llvm/Target/Target.td"
 def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
                                   "64-bit mode (x86_64)">;
 
+def ModeNaCl  : SubtargetFeature<"nacl-mode", "InNaClMode", "true",
+                                 "Native Client mode">;
+
 //===----------------------------------------------------------------------===//
 // X86 Subtarget features.
 //===----------------------------------------------------------------------===//
@@ -68,6 +71,9 @@ def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
 def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
                                       "Support 64-bit instructions",
                                       [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+                                      "64-bit with cmpxchg16b",
+                                      [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                        "Bit testing of memory is slow">;
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
@@ -90,6 +96,16 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
                  "Allow unaligned memory operands on vector/SIMD instructions">;
 def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
                                       "Enable AES instructions">;
+def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
+                                      "Support MOVBE instruction">;
+def FeatureRDRAND  : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+                                      "Support RDRAND instruction">;
+def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
+                       "Support 16-bit floating point conversion instructions">;
+def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+                                      "Support LZCNT instruction">;
+def FeatureBMI     : SubtargetFeature<"bmi", "HasBMI", "true",
+                                      "Support BMI instructions">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -112,27 +128,43 @@ def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
 def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"pentium4",        [FeatureSSE2]>;
 def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"x86-64",          [FeatureSSE2,   Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"x86-64",          [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
 def : Proc<"yonah",           [FeatureSSE3, FeatureSlowBTMem]>;
 def : Proc<"prescott",        [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona",          [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"core2",           [FeatureSSSE3,  Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"penryn",          [FeatureSSE41,  Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"atom",            [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"nocona",          [FeatureSSE3, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"core2",           [FeatureSSSE3, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"penryn",          [FeatureSSE41, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"atom",            [FeatureSSE3, FeatureCMPXCHG16B, FeatureMOVBE,
+                               FeatureSlowBTMem]>;
 // "Arrandale" along with corei3 and corei5
-def : Proc<"corei7",          [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem,
-                               FeatureFastUAMem, FeatureAES]>;
-def : Proc<"nehalem",         [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem,
-                               FeatureFastUAMem]>;
+def : Proc<"corei7",          [FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>;
+def : Proc<"nehalem",         [FeatureSSE42,  FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureFastUAMem]>;
 // Westmere is a similar machine to nehalem with some additional features.
 // Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : Proc<"westmere",        [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem,
-                               FeatureFastUAMem, FeatureAES, FeatureCLMUL]>;
+def : Proc<"westmere",        [FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureFastUAMem, FeatureAES,
+                               FeatureCLMUL]>;
+// Sandy Bridge
 // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
 // rather than a superset.
 // FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"corei7-avx",      [FeatureSSE42, Feature64Bit,
+def : Proc<"corei7-avx",      [FeatureSSE42, FeatureCMPXCHG16B,
                                FeatureAES, FeatureCLMUL]>;
+// Ivy Bridge
+def : Proc<"core-avx-i",      [FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureAES, FeatureCLMUL,
+                               FeatureRDRAND, FeatureF16C]>;
+
+// Haswell
+def : Proc<"core-avx2",       [FeatureSSE42, FeatureCMPXCHG16B, FeatureAES,
+                               FeatureCLMUL, FeatureRDRAND, FeatureF16C,
+                               FeatureFMA3, FeatureMOVBE, FeatureLZCNT,
+                               FeatureBMI]>;
 
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
@@ -150,19 +182,21 @@ def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,
                                FeatureSlowBTMem]>;
 def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,
                                FeatureSlowBTMem]>;
-def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
                                FeatureSlowBTMem]>;
-def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
                                FeatureSlowBTMem]>;
-def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
                                FeatureSlowBTMem]>;
 def : Proc<"amdfam10",        [FeatureSSE3,   FeatureSSE4A,
-                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+                               Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
 def : Proc<"barcelona",       [FeatureSSE3,   FeatureSSE4A,
-                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"istanbul",        [Feature3DNowA, Feature64Bit, FeatureSSE4A,
-                               Feature3DNowA]>;
-def : Proc<"shanghai",        [Feature3DNowA, Feature64Bit, FeatureSSE4A,
+                               Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"istanbul",        [Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSSE4A, Feature3DNowA]>;
+def : Proc<"shanghai",        [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A,
                                Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp
deleted file mode 100644
index 9b556a55efd9d..0000000000000
--- a/lib/Target/X86/X86AsmBackend.cpp
+++ /dev/null
@@ -1,453 +0,0 @@
-//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmBackend.h"
-#include "X86.h"
-#include "X86FixupKinds.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Object/MachOFormat.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
-using namespace llvm;
-
-// Option to allow disabling arithmetic relaxation to workaround PR9807, which
-// is useful when running bitwise comparison experiments on Darwin. We should be
-// able to remove this once PR9807 is resolved.
-static cl::opt<bool>
-MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
-         cl::desc("Disable relaxation of arithmetic instruction for X86"));
-
-static unsigned getFixupKindLog2Size(unsigned Kind) {
-  switch (Kind) {
-  default: assert(0 && "invalid fixup kind!");
-  case FK_PCRel_1:
-  case FK_Data_1: return 0;
-  case FK_PCRel_2:
-  case FK_Data_2: return 1;
-  case FK_PCRel_4:
-  case X86::reloc_riprel_4byte:
-  case X86::reloc_riprel_4byte_movq_load:
-  case X86::reloc_signed_4byte:
-  case X86::reloc_global_offset_table:
-  case FK_Data_4: return 2;
-  case FK_PCRel_8:
-  case FK_Data_8: return 3;
-  }
-}
-
-namespace {
-
-class X86ELFObjectWriter : public MCELFObjectTargetWriter {
-public:
-  X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine,
-                     bool HasRelocationAddend)
-    : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {}
-};
-
-class X86AsmBackend : public TargetAsmBackend {
-public:
-  X86AsmBackend(const Target &T)
-    : TargetAsmBackend() {}
-
-  unsigned getNumFixupKinds() const {
-    return X86::NumTargetFixupKinds;
-  }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
-    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
-      { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
-      { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
-      { "reloc_signed_4byte", 0, 4 * 8, 0},
-      { "reloc_global_offset_table", 0, 4 * 8, 0}
-    };
-
-    if (Kind < FirstTargetFixupKind)
-      return TargetAsmBackend::getFixupKindInfo(Kind);
-
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return Infos[Kind - FirstTargetFixupKind];
-  }
-
-  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value) const {
-    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
-
-    assert(Fixup.getOffset() + Size <= DataSize &&
-           "Invalid fixup offset!");
-    for (unsigned i = 0; i != Size; ++i)
-      Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
-  }
-
-  bool MayNeedRelaxation(const MCInst &Inst) const;
-
-  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
-
-  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
-};
-} // end anonymous namespace
-
-static unsigned getRelaxedOpcodeBranch(unsigned Op) {
-  switch (Op) {
-  default:
-    return Op;
-
-  case X86::JAE_1: return X86::JAE_4;
-  case X86::JA_1:  return X86::JA_4;
-  case X86::JBE_1: return X86::JBE_4;
-  case X86::JB_1:  return X86::JB_4;
-  case X86::JE_1:  return X86::JE_4;
-  case X86::JGE_1: return X86::JGE_4;
-  case X86::JG_1:  return X86::JG_4;
-  case X86::JLE_1: return X86::JLE_4;
-  case X86::JL_1:  return X86::JL_4;
-  case X86::JMP_1: return X86::JMP_4;
-  case X86::JNE_1: return X86::JNE_4;
-  case X86::JNO_1: return X86::JNO_4;
-  case X86::JNP_1: return X86::JNP_4;
-  case X86::JNS_1: return X86::JNS_4;
-  case X86::JO_1:  return X86::JO_4;
-  case X86::JP_1:  return X86::JP_4;
-  case X86::JS_1:  return X86::JS_4;
-  }
-}
-
-static unsigned getRelaxedOpcodeArith(unsigned Op) {
-  switch (Op) {
-  default:
-    return Op;
-
-    // IMUL
-  case X86::IMUL16rri8: return X86::IMUL16rri;
-  case X86::IMUL16rmi8: return X86::IMUL16rmi;
-  case X86::IMUL32rri8: return X86::IMUL32rri;
-  case X86::IMUL32rmi8: return X86::IMUL32rmi;
-  case X86::IMUL64rri8: return X86::IMUL64rri32;
-  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
-
-    // AND
-  case X86::AND16ri8: return X86::AND16ri;
-  case X86::AND16mi8: return X86::AND16mi;
-  case X86::AND32ri8: return X86::AND32ri;
-  case X86::AND32mi8: return X86::AND32mi;
-  case X86::AND64ri8: return X86::AND64ri32;
-  case X86::AND64mi8: return X86::AND64mi32;
-
-    // OR
-  case X86::OR16ri8: return X86::OR16ri;
-  case X86::OR16mi8: return X86::OR16mi;
-  case X86::OR32ri8: return X86::OR32ri;
-  case X86::OR32mi8: return X86::OR32mi;
-  case X86::OR64ri8: return X86::OR64ri32;
-  case X86::OR64mi8: return X86::OR64mi32;
-
-    // XOR
-  case X86::XOR16ri8: return X86::XOR16ri;
-  case X86::XOR16mi8: return X86::XOR16mi;
-  case X86::XOR32ri8: return X86::XOR32ri;
-  case X86::XOR32mi8: return X86::XOR32mi;
-  case X86::XOR64ri8: return X86::XOR64ri32;
-  case X86::XOR64mi8: return X86::XOR64mi32;
-
-    // ADD
-  case X86::ADD16ri8: return X86::ADD16ri;
-  case X86::ADD16mi8: return X86::ADD16mi;
-  case X86::ADD32ri8: return X86::ADD32ri;
-  case X86::ADD32mi8: return X86::ADD32mi;
-  case X86::ADD64ri8: return X86::ADD64ri32;
-  case X86::ADD64mi8: return X86::ADD64mi32;
-
-    // SUB
-  case X86::SUB16ri8: return X86::SUB16ri;
-  case X86::SUB16mi8: return X86::SUB16mi;
-  case X86::SUB32ri8: return X86::SUB32ri;
-  case X86::SUB32mi8: return X86::SUB32mi;
-  case X86::SUB64ri8: return X86::SUB64ri32;
-  case X86::SUB64mi8: return X86::SUB64mi32;
-
-    // CMP
-  case X86::CMP16ri8: return X86::CMP16ri;
-  case X86::CMP16mi8: return X86::CMP16mi;
-  case X86::CMP32ri8: return X86::CMP32ri;
-  case X86::CMP32mi8: return X86::CMP32mi;
-  case X86::CMP64ri8: return X86::CMP64ri32;
-  case X86::CMP64mi8: return X86::CMP64mi32;
-
-    // PUSH
-  case X86::PUSHi8: return X86::PUSHi32;
-  case X86::PUSHi16: return X86::PUSHi32;
-  case X86::PUSH64i8: return X86::PUSH64i32;
-  case X86::PUSH64i16: return X86::PUSH64i32;
-  }
-}
-
-static unsigned getRelaxedOpcode(unsigned Op) {
-  unsigned R = getRelaxedOpcodeArith(Op);
-  if (R != Op)
-    return R;
-  return getRelaxedOpcodeBranch(Op);
-}
-
-bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
-  // Branches can always be relaxed.
-  if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
-    return true;
-
-  if (MCDisableArithRelaxation)
-    return false;
-
-  // Check if this instruction is ever relaxable.
-  if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
-    return false;
-
-
-  // Check if it has an expression and is not RIP relative.
-  bool hasExp = false;
-  bool hasRIP = false;
-  for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
-    const MCOperand &Op = Inst.getOperand(i);
-    if (Op.isExpr())
-      hasExp = true;
-
-    if (Op.isReg() && Op.getReg() == X86::RIP)
-      hasRIP = true;
-  }
-
-  // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
-  // how we do relaxations?
-  return hasExp && !hasRIP;
-}
-
-// FIXME: Can tblgen help at all here to verify there aren't other instructions
-// we can relax?
-void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
-  // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
-  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
-
-  if (RelaxedOp == Inst.getOpcode()) {
-    SmallString<256> Tmp;
-    raw_svector_ostream OS(Tmp);
-    Inst.dump_pretty(OS);
-    OS << "\n";
-    report_fatal_error("unexpected instruction to relax: " + OS.str());
-  }
-
-  Res = Inst;
-  Res.setOpcode(RelaxedOp);
-}
-
-/// WriteNopData - Write optimal nops to the output file for the \arg Count
-/// bytes.  This returns the number of bytes written.  It may return 0 if
-/// the \arg Count is more than the maximum optimal nops.
-bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
-  static const uint8_t Nops[10][10] = {
-    // nop
-    {0x90},
-    // xchg %ax,%ax
-    {0x66, 0x90},
-    // nopl (%[re]ax)
-    {0x0f, 0x1f, 0x00},
-    // nopl 0(%[re]ax)
-    {0x0f, 0x1f, 0x40, 0x00},
-    // nopl 0(%[re]ax,%[re]ax,1)
-    {0x0f, 0x1f, 0x44, 0x00, 0x00},
-    // nopw 0(%[re]ax,%[re]ax,1)
-    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
-    // nopl 0L(%[re]ax)
-    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
-    // nopl 0L(%[re]ax,%[re]ax,1)
-    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-    // nopw 0L(%[re]ax,%[re]ax,1)
-    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-    // nopw %cs:0L(%[re]ax,%[re]ax,1)
-    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-  };
-
-  // Write an optimal sequence for the first 15 bytes.
-  const uint64_t OptimalCount = (Count < 16) ? Count : 15;
-  const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
-  for (uint64_t i = 0, e = Prefixes; i != e; i++)
-    OW->Write8(0x66);
-  const uint64_t Rest = OptimalCount - Prefixes;
-  for (uint64_t i = 0, e = Rest; i != e; i++)
-    OW->Write8(Nops[Rest - 1][i]);
-
-  // Finish with single byte nops.
-  for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
-   OW->Write8(0x90);
-
-  return true;
-}
-
-/* *** */
-
-namespace {
-class ELFX86AsmBackend : public X86AsmBackend {
-public:
-  Triple::OSType OSType;
-  ELFX86AsmBackend(const Target &T, Triple::OSType _OSType)
-    : X86AsmBackend(T), OSType(_OSType) {
-    HasReliableSymbolDifference = true;
-  }
-
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section);
-    return ES.getFlags() & ELF::SHF_MERGE;
-  }
-};
-
-class ELFX86_32AsmBackend : public ELFX86AsmBackend {
-public:
-  ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType)
-    : ELFX86AsmBackend(T, OSType) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createELFObjectWriter(createELFObjectTargetWriter(),
-                                 OS, /*IsLittleEndian*/ true);
-  }
-
-  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
-    return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false);
-  }
-};
-
-class ELFX86_64AsmBackend : public ELFX86AsmBackend {
-public:
-  ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType)
-    : ELFX86AsmBackend(T, OSType) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createELFObjectWriter(createELFObjectTargetWriter(),
-                                 OS, /*IsLittleEndian*/ true);
-  }
-
-  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
-    return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true);
-  }
-};
-
-class WindowsX86AsmBackend : public X86AsmBackend {
-  bool Is64Bit;
-
-public:
-  WindowsX86AsmBackend(const Target &T, bool is64Bit)
-    : X86AsmBackend(T)
-    , Is64Bit(is64Bit) {
-  }
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createWinCOFFObjectWriter(OS, Is64Bit);
-  }
-};
-
-class DarwinX86AsmBackend : public X86AsmBackend {
-public:
-  DarwinX86AsmBackend(const Target &T)
-    : X86AsmBackend(T) { }
-};
-
-class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
-public:
-  DarwinX86_32AsmBackend(const Target &T)
-    : DarwinX86AsmBackend(T) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_i386,
-                                     object::mach::CSX86_ALL);
-  }
-};
-
-class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
-public:
-  DarwinX86_64AsmBackend(const Target &T)
-    : DarwinX86AsmBackend(T) {
-    HasReliableSymbolDifference = true;
-  }
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
-                                     object::mach::CTM_x86_64,
-                                     object::mach::CSX86_ALL);
-  }
-
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    // Temporary labels in the string literals sections require symbols. The
-    // issue is that the x86_64 relocation format does not allow symbol +
-    // offset, and so the linker does not have enough information to resolve the
-    // access to the appropriate atom unless an external relocation is used. For
-    // non-cstring sections, we expect the compiler to use a non-temporary label
-    // for anything that could have an addend pointing outside the symbol.
-    //
-    // See <rdar://problem/4765733>.
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-    return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
-  }
-
-  virtual bool isSectionAtomizable(const MCSection &Section) const {
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-    // Fixed sized data sections are uniqued, they cannot be diced into atoms.
-    switch (SMO.getType()) {
-    default:
-      return true;
-
-    case MCSectionMachO::S_4BYTE_LITERALS:
-    case MCSectionMachO::S_8BYTE_LITERALS:
-    case MCSectionMachO::S_16BYTE_LITERALS:
-    case MCSectionMachO::S_LITERAL_POINTERS:
-    case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
-    case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
-    case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
-    case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
-    case MCSectionMachO::S_INTERPOSING:
-      return false;
-    }
-  }
-};
-
-} // end anonymous namespace
-
-TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
-                                               const std::string &TT) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_32AsmBackend(T);
-
-  if (TheTriple.isOSWindows())
-    return new WindowsX86AsmBackend(T, false);
-
-  return new ELFX86_32AsmBackend(T, TheTriple.getOS());
-}
-
-TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
-                                               const std::string &TT) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_64AsmBackend(T);
-
-  if (TheTriple.isOSWindows())
-    return new WindowsX86AsmBackend(T, true);
-
-  return new ELFX86_64AsmBackend(T, TheTriple.getOS());
-}
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 99b4479a9fc9b..4c3ff02826b07 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -35,12 +35,12 @@
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 using namespace llvm;
 
@@ -504,8 +504,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
         //   .indirect_symbol _foo
         OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
                                         MCSA_IndirectSymbol);
-        // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4 = -12.
-        const char HltInsts[] = { -12, -12, -12, -12, -12 };
+        // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4.
+        const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
         OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/);
       }
 
@@ -708,21 +708,8 @@ void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 // Target Registry Stuff
 //===----------------------------------------------------------------------===//
 
-static MCInstPrinter *createX86MCInstPrinter(const Target &T,
-                                             unsigned SyntaxVariant,
-                                             const MCAsmInfo &MAI) {
-  if (SyntaxVariant == 0)
-    return new X86ATTInstPrinter(MAI);
-  if (SyntaxVariant == 1)
-    return new X86IntelInstPrinter(MAI);
-  return 0;
-}
-
 // Force static initialization.
 extern "C" void LLVMInitializeX86AsmPrinter() {
   RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
   RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
-
-  TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter);
 }
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 4b11db7c03312..aeff03a89ec9b 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -98,8 +98,6 @@ namespace {
     void emitMemModRMByte(const MachineInstr &MI,
                           unsigned Op, unsigned RegOpcodeField,
                           intptr_t PCAdj = 0);
-
-    unsigned getX86RegNum(unsigned RegNo) const;
   };
 
 template<class CodeEmitter>
@@ -169,7 +167,7 @@ static unsigned determineREX(const MachineInstr &MI) {
       const MachineOperand& MO = MI.getOperand(i);
       if (MO.isReg()) {
         unsigned Reg = MO.getReg();
-        if (X86InstrInfo::isX86_64NonExtLowByteReg(Reg))
+        if (X86II::isX86_64NonExtLowByteReg(Reg))
           REX |= 0x40;
       }
     }
@@ -346,11 +344,6 @@ void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
     MCE.emitWordLE(0);
 }
 
-template<class CodeEmitter>
-unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
-  return X86RegisterInfo::getX86RegNum(RegNo);
-}
-
 inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
                                       unsigned RM) {
   assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
@@ -360,7 +353,7 @@ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
 template<class CodeEmitter>
 void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
                                             unsigned RegOpcodeFld){
-  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg)));
 }
 
 template<class CodeEmitter>
@@ -498,7 +491,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
   // 2-7) and absolute references.
   unsigned BaseRegNo = -1U;
   if (BaseReg != 0 && BaseReg != X86::RIP)
-    BaseRegNo = getX86RegNum(BaseReg);
+    BaseRegNo = X86_MC::getX86RegNum(BaseReg);
 
   if (// The SIB byte must be used if there is an index register.
       IndexReg.getReg() == 0 && 
@@ -566,7 +559,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
   }
 
   // Calculate what the SS field value should be...
-  static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
   unsigned SS = SSTable[Scale.getImm()];
 
   if (BaseReg == 0) {
@@ -574,15 +567,15 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
     // Manual 2A, table 2-7. The displacement has already been output.
     unsigned IndexRegNo;
     if (IndexReg.getReg())
-      IndexRegNo = getX86RegNum(IndexReg.getReg());
+      IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
     else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
       IndexRegNo = 4;
     emitSIBByte(SS, IndexRegNo, 5);
   } else {
-    unsigned BaseRegNo = getX86RegNum(BaseReg);
+    unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg);
     unsigned IndexRegNo;
     if (IndexReg.getReg())
-      IndexRegNo = getX86RegNum(IndexReg.getReg());
+      IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
     else
       IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
     emitSIBByte(SS, IndexRegNo, BaseRegNo);
@@ -809,7 +802,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
   }
       
   case X86II::AddRegFrm: {
-    MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+    MCE.emitByte(BaseOpcode +
+                 X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
     
     if (CurOp == NumOps)
       break;
@@ -844,7 +838,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
   case X86II::MRMDestReg: {
     MCE.emitByte(BaseOpcode);
     emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+                     X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
     CurOp += 2;
     if (CurOp != NumOps)
       emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -854,7 +848,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
   case X86II::MRMDestMem: {
     MCE.emitByte(BaseOpcode);
     emitMemModRMByte(MI, CurOp,
-                     getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
+                X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
                                   .getReg()));
     CurOp +=  X86::AddrNumOperands + 1;
     if (CurOp != NumOps)
@@ -866,7 +860,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
   case X86II::MRMSrcReg:
     MCE.emitByte(BaseOpcode);
     emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
-                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
     CurOp += 2;
     if (CurOp != NumOps)
       emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -880,8 +874,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
       X86II::getSizeOfImm(Desc->TSFlags) : 0;
 
     MCE.emitByte(BaseOpcode);
-    emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
-                     PCAdj);
+    emitMemModRMByte(MI, CurOp+1,
+                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
     CurOp += AddrOperands + 1;
     if (CurOp != NumOps)
       emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -968,7 +962,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
     MCE.emitByte(BaseOpcode);
     // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
     emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+                     X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
     ++CurOp;
     break;
       
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
index f1d7edea7210a..4a72d154c335e 100644
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -147,7 +147,7 @@ long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
   if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32)
     return SymOffset - (RelOffset + 4);
   else
-    assert("computeRelocation unknown for this relocation type");
+    assert(0 && "computeRelocation unknown for this relocation type");
 
   return 0;
 }
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 21e163a300545..f912b28eb47b9 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CallingConv.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Operator.h"
@@ -59,8 +60,8 @@ public:
   explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
     StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
-    X86ScalarSSEf64 = Subtarget->hasSSE2();
-    X86ScalarSSEf32 = Subtarget->hasSSE1();
+    X86ScalarSSEf64 = Subtarget->hasSSE2() || Subtarget->hasAVX();
+    X86ScalarSSEf32 = Subtarget->hasSSE1() || Subtarget->hasAVX();
   }
 
   virtual bool TargetSelectInstruction(const Instruction *I);
@@ -134,7 +135,7 @@ private:
       (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
   }
 
-  bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
 
   bool IsMemcpySmall(uint64_t Len);
 
@@ -144,7 +145,7 @@ private:
 
 } // end anonymous namespace.
 
-bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
   EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
   if (evt == MVT::Other || !evt.isSimple())
     // Unhandled type. Halt "fast" selection and bail.
@@ -198,8 +199,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
     RC  = X86::GR64RegisterClass;
     break;
   case MVT::f32:
-    if (Subtarget->hasSSE1()) {
-      Opc = X86::MOVSSrm;
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
       RC  = X86::FR32RegisterClass;
     } else {
       Opc = X86::LD_Fp32m;
@@ -207,8 +208,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
     }
     break;
   case MVT::f64:
-    if (Subtarget->hasSSE2()) {
-      Opc = X86::MOVSDrm;
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
       RC  = X86::FR64RegisterClass;
     } else {
       Opc = X86::LD_Fp64m;
@@ -250,10 +251,12 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
   case MVT::i32: Opc = X86::MOV32mr; break;
   case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
   case MVT::f32:
-    Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+    Opc = X86ScalarSSEf32 ?
+          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
     break;
   case MVT::f64:
-    Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+    Opc = X86ScalarSSEf64 ?
+          (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
     break;
   }
 
@@ -336,7 +339,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
     U = C;
   }
 
-  if (const PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
     if (Ty->getAddressSpace() > 255)
       // Fast instruction selection doesn't support the special
       // address spaces.
@@ -399,7 +402,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
     for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
          i != e; ++i, ++GTI) {
       const Value *Op = *i;
-      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
         const StructLayout *SL = TD.getStructLayout(STy);
         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
         continue;
@@ -465,14 +468,23 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 
   // Handle constant address.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    // Can't handle alternate code models or TLS yet.
+    // Can't handle alternate code models yet.
     if (TM.getCodeModel() != CodeModel::Small)
       return false;
 
+    // Can't handle TLS yet.
     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
       if (GVar->isThreadLocal())
         return false;
 
+    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+    // it works...).
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      if (const GlobalVariable *GVar =
+            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+        if (GVar->isThreadLocal())
+          return false;
+
     // RIP-relative addresses can't have additional register operands, so if
     // we've already folded stuff into the addressing mode, just force the
     // global value into its own register, which we can use as the basereg.
@@ -658,6 +670,10 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
 
 /// X86SelectStore - Select and emit code to implement store instructions.
 bool X86FastISel::X86SelectStore(const Instruction *I) {
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
   MVT VT;
   if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
     return false;
@@ -780,6 +796,10 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
 /// X86SelectLoad - Select and emit code to implement load instructions.
 ///
 bool X86FastISel::X86SelectLoad(const Instruction *I)  {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
   MVT VT;
   if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
     return false;
@@ -797,14 +817,20 @@ bool X86FastISel::X86SelectLoad(const Instruction *I)  {
 }
 
 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+  bool HasAVX = Subtarget->hasAVX();
+  bool X86ScalarSSEf32 = HasAVX || Subtarget->hasSSE1();
+  bool X86ScalarSSEf64 = HasAVX || Subtarget->hasSSE2();
+
   switch (VT.getSimpleVT().SimpleTy) {
   default:       return 0;
   case MVT::i8:  return X86::CMP8rr;
   case MVT::i16: return X86::CMP16rr;
   case MVT::i32: return X86::CMP32rr;
   case MVT::i64: return X86::CMP64rr;
-  case MVT::f32: return Subtarget->hasSSE1() ? X86::UCOMISSrr : 0;
-  case MVT::f64: return Subtarget->hasSSE2() ? X86::UCOMISDrr : 0;
+  case MVT::f32:
+    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+  case MVT::f64:
+    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
   }
 }
 
@@ -1207,7 +1233,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
 
 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
   // fpext from float to double.
-  if (Subtarget->hasSSE2() &&
+  if (X86ScalarSSEf64 &&
       I->getType()->isDoubleTy()) {
     const Value *V = I->getOperand(0);
     if (V->getType()->isFloatTy()) {
@@ -1226,7 +1252,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
-  if (Subtarget->hasSSE2()) {
+  if (X86ScalarSSEf64) {
     if (I->getType()->isFloatTy()) {
       const Value *V = I->getOperand(0);
       if (V->getType()->isDoubleTy()) {
@@ -1365,6 +1391,9 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   case Intrinsic::memset: {
     const MemSetInst &MSI = cast<MemSetInst>(I);
 
+    if (MSI.isVolatile())
+      return false;
+
     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
     if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
@@ -1411,7 +1440,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     // Replace "add with overflow" intrinsics with an "add" instruction followed
     // by a seto/setc instruction.
     const Function *Callee = I.getCalledFunction();
-    const Type *RetTy =
+    Type *RetTy =
       cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
 
     MVT VT;
@@ -1484,8 +1513,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
   if (CC == CallingConv::Fast && GuaranteedTailCallOpt)
     return false;
 
-  const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
   bool isVarArg = FTy->isVarArg();
 
   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
@@ -1547,8 +1576,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
       Flags.setZExt();
 
     if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
-      const PointerType *Ty = cast<PointerType>(ArgVal->getType());
-      const Type *ElementTy = Ty->getElementType();
+      PointerType *Ty = cast<PointerType>(ArgVal->getType());
+      Type *ElementTy = Ty->getElementType();
       unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
       unsigned FrameAlign = CS.getParamAlignment(AttrInd);
       if (!FrameAlign)
@@ -1600,7 +1629,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
 
     if (ArgReg == 0) return false;
 
-    const Type *ArgTy = ArgVal->getType();
+    Type *ArgTy = ArgVal->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT))
       return false;
@@ -1709,7 +1738,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
         assert(Res && "memcpy length already checked!"); (void)Res;
       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
         // If this is a really simple value, emit this with the Value* version
-        //of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
         // as it can cause us to reevaluate the argument.
         X86FastEmitStore(ArgVT, ArgVal, AM);
       } else {
@@ -1965,8 +1994,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
     RC  = X86::GR64RegisterClass;
     break;
   case MVT::f32:
-    if (Subtarget->hasSSE1()) {
-      Opc = X86::MOVSSrm;
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
       RC  = X86::FR32RegisterClass;
     } else {
       Opc = X86::LD_Fp32m;
@@ -1974,8 +2003,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
     }
     break;
   case MVT::f64:
-    if (Subtarget->hasSSE2()) {
-      Opc = X86::MOVSDrm;
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
       RC  = X86::FR64RegisterClass;
     } else {
       Opc = X86::LD_Fp64m;
@@ -2070,8 +2099,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
   switch (VT.SimpleTy) {
     default: return false;
     case MVT::f32:
-      if (Subtarget->hasSSE1()) {
-        Opc = X86::FsFLD0SS;
+      if (X86ScalarSSEf32) {
+        Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS;
         RC  = X86::FR32RegisterClass;
       } else {
         Opc = X86::LD_Fp032;
@@ -2079,8 +2108,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
       }
       break;
     case MVT::f64:
-      if (Subtarget->hasSSE2()) {
-        Opc = X86::FsFLD0SD;
+      if (X86ScalarSSEf64) {
+        Opc = Subtarget->hasAVX() ? X86::VFsFLD0SD : X86::FsFLD0SD;
         RC  = X86::FR64RegisterClass;
       } else {
         Opc = X86::LD_Fp064;
diff --git a/lib/Target/X86/X86FixupKinds.h b/lib/Target/X86/X86FixupKinds.h
deleted file mode 100644
index 17d242ab761e3..0000000000000
--- a/lib/Target/X86/X86FixupKinds.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_X86_X86FIXUPKINDS_H
-#define LLVM_X86_X86FIXUPKINDS_H
-
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-namespace X86 {
-enum Fixups {
-  reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
-  reloc_riprel_4byte_movq_load,              // 32-bit rip-relative in movq
-  reloc_signed_4byte,                        // 32-bit signed. Unlike FK_Data_4
-                                             // this will be sign extended at
-                                             // runtime.
-  reloc_global_offset_table,                 // 32-bit, relative to the start
-                                             // of the instruction. Used only
-                                             // for _GLOBAL_OFFSET_TABLE_.
-  // Marker
-  LastTargetFixupKind,
-  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
-};
-}
-}
-
-#endif
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 6eed6abd43e25..e3461c82c7a6c 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -260,6 +260,21 @@ namespace {
       BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
     }
 
+    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
+    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
+    /// them to new scratch registers.
+    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
+      for (unsigned i = 0; i != NumPendingSTs; ++i) {
+        if (PendingST[i] != RegNo)
+          continue;
+        unsigned SR = getScratchReg();
+        DEBUG(dbgs() << "Duplicating pending ST" << i
+                     << " in FP" << RegNo << " to FP" << SR << '\n');
+        duplicateToTop(RegNo, SR, I);
+        PendingST[i] = SR;
+      }
+    }
+
     /// popStackAfter - Pop the current value off of the top of the FP stack
     /// after the specified instruction.
     void popStackAfter(MachineBasicBlock::iterator &I);
@@ -406,6 +421,10 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
     if (MI->isCopy() && isFPCopy(MI))
       FPInstClass = X86II::SpecialFP;
 
+    if (MI->isImplicitDef() &&
+        X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+      FPInstClass = X86II::SpecialFP;
+
     if (FPInstClass == X86II::NotFP)
       continue;  // Efficiently ignore non-fp insts!
 
@@ -461,6 +480,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
       }
       dumpStack();
     );
+    (void)PrevMI;
 
     Changed = true;
   }
@@ -969,6 +989,9 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
   unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
+  if (KillsSrc)
+    duplicatePendingSTBeforeKill(Reg, I);
+
   // FISTP64m is strange because there isn't a non-popping versions.
   // If we have one _and_ we don't want to pop the operand, duplicate the value
   // on the stack instead of moving it.  This ensure that popping the value is
@@ -1032,6 +1055,7 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
   if (KillsSrc) {
+    duplicatePendingSTBeforeKill(Reg, I);
     // If this is the last use of the source register, just make sure it's on
     // the top of the stack.
     moveToTop(Reg, I);
@@ -1318,6 +1342,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
 
       // When the source is killed, allocate a scratch FP register.
       if (KillsSrc) {
+        duplicatePendingSTBeforeKill(SrcFP, I);
         unsigned Slot = getSlot(SrcFP);
         unsigned SR = getScratchReg();
         PendingST[DstST] = SR;
@@ -1369,6 +1394,15 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
     break;
   }
 
+  case TargetOpcode::IMPLICIT_DEF: {
+    // All FP registers must be explicitly defined, so load a 0 instead.
+    unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+    DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+    pushReg(Reg);
+    break;
+  }
+
   case X86::FpPOP_RETVAL: {
     // The FpPOP_RETVAL instruction is used after calls that return a value on
     // the floating point stack. We cannot model this with ST defs since CALL
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index ed45a9a4c1c0a..d54f4ae2a2c81 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -15,6 +15,7 @@
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/Function.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -91,12 +92,12 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
     return 0;
 
   static const unsigned CallerSavedRegs32Bit[] = {
-    X86::EAX, X86::EDX, X86::ECX
+    X86::EAX, X86::EDX, X86::ECX, 0
   };
 
   static const unsigned CallerSavedRegs64Bit[] = {
     X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
-    X86::R8,  X86::R9,  X86::R10, X86::R11
+    X86::R8,  X86::R9,  X86::R10, X86::R11, 0
   };
 
   unsigned Opc = MBBI->getOpcode();
@@ -283,8 +284,8 @@ static bool isEAXLiveIn(MachineFunction &MF) {
 }
 
 void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
-                                             MCSymbol *Label,
-                                             unsigned FramePtr) const {
+                                                 MCSymbol *Label,
+                                                 unsigned FramePtr) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
 
@@ -346,6 +347,247 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
   }
 }
 
+/// getCompactUnwindRegNum - Get the compact unwind number for a given
+/// register. The number corresponds to the enum lists in
+/// compact_unwind_encoding.h.
+static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) {
+  int Idx = 1;
+  for (; *CURegs; ++CURegs, ++Idx)
+    if (*CURegs == Reg)
+      return Idx;
+
+  return -1;
+}
+
+/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
+/// used with frameless stacks. It is passed the number of registers to be saved
+/// and an array of the registers saved.
+static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6],
+                                                         unsigned RegCount,
+                                                         bool Is64Bit) {
+  // The saved registers are numbered from 1 to 6. In order to encode the order
+  // in which they were saved, we re-number them according to their place in the
+  // register order. The re-numbering is relative to the last re-numbered
+  // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
+  //
+  //    Orig  Re-Num
+  //    ----  ------
+  //     6       6
+  //     2       2
+  //     4       3
+  //     5       3
+  //
+  static const unsigned CU32BitRegs[] = {
+    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+  };
+  static const unsigned CU64BitRegs[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+  const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+  uint32_t RenumRegs[6];
+  for (unsigned i = 6 - RegCount; i < 6; ++i) {
+    int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+    if (CUReg == -1) return ~0U;
+    SavedRegs[i] = CUReg;
+
+    unsigned Countless = 0;
+    for (unsigned j = 6 - RegCount; j < i; ++j)
+      if (SavedRegs[j] < SavedRegs[i])
+        ++Countless;
+
+    RenumRegs[i] = SavedRegs[i] - Countless - 1;
+  }
+
+  // Take the renumbered values and encode them into a 10-bit number.
+  uint32_t permutationEncoding = 0;
+  switch (RegCount) {
+  case 6:
+    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                           +     RenumRegs[4];
+    break;
+  case 5:
+    permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                           + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                           +     RenumRegs[5];
+    break;
+  case 4:
+    permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                           + 3 * RenumRegs[4] +      RenumRegs[5];
+    break;
+  case 3:
+    permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                           +     RenumRegs[5];
+    break;
+  case 2:
+    permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+    break;
+  case 1:
+    permutationEncoding |=       RenumRegs[5];
+    break;
+  }
+
+  assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+         "Invalid compact register encoding!");
+  return permutationEncoding;
+}
+
+/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
+/// compact encoding with a frame pointer.
+static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6],
+                                                      bool Is64Bit) {
+  static const unsigned CU32BitRegs[] = {
+    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+  };
+  static const unsigned CU64BitRegs[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+  const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+  // Encode the registers in the order they were saved, 3-bits per register. The
+  // registers are numbered from 1 to 6.
+  uint32_t RegEnc = 0;
+  for (int I = 5; I >= 0; --I) {
+    unsigned Reg = SavedRegs[I];
+    if (Reg == 0) break;
+    int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+    if (CURegNum == -1)
+      return ~0U;
+    RegEnc |= (CURegNum & 0x7) << (5 - I);
+  }
+
+  assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!");
+  return RegEnc;
+}
+
+uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+  bool Is64Bit = STI.is64Bit();
+  bool HasFP = hasFP(MF);
+
+  unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 };
+  int SavedRegIdx = 6;
+
+  unsigned OffsetSize = (Is64Bit ? 8 : 4);
+
+  unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
+  unsigned PushInstrSize = 1;
+  unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+  unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
+  unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta);
+  unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
+
+  unsigned StackDivide = (Is64Bit ? 8 : 4);
+
+  unsigned InstrOffset = 0;
+  unsigned CFAOffset = 0;
+  unsigned StackAdjust = 0;
+
+  MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
+  bool ExpectEnd = false;
+  for (MachineBasicBlock::iterator
+         MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    unsigned Opc = MI.getOpcode();
+    if (Opc == X86::PROLOG_LABEL) continue;
+    if (!MI.getFlag(MachineInstr::FrameSetup)) break;
+
+    // We don't exect any more prolog instructions.
+    if (ExpectEnd) return 0;
+
+    if (Opc == PushInstr) {
+      // If there are too many saved registers, we cannot use compact encoding.
+      if (--SavedRegIdx < 0) return 0;
+
+      SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg();
+      CFAOffset += OffsetSize;
+      InstrOffset += PushInstrSize;
+    } else if (Opc == MoveInstr) {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+
+      if (DstReg != FramePtr || SrcReg != StackPtr)
+        return 0;
+
+      CFAOffset = 0;
+      memset(SavedRegs, 0, sizeof(SavedRegs));
+      InstrOffset += MoveInstrSize;
+    } else if (Opc == SubtractInstr) {
+      if (StackAdjust)
+        // We all ready have a stack pointer adjustment.
+        return 0;
+
+      if (!MI.getOperand(0).isReg() ||
+          MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
+          MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
+        // We need this to be a stack adjustment pointer. Something like:
+        //
+        //   %RSP<def> = SUB64ri8 %RSP, 48
+        return 0;
+
+      StackAdjust = MI.getOperand(2).getImm() / StackDivide;
+      SubtractInstrIdx += InstrOffset;
+      ExpectEnd = true;
+    }
+  }
+
+  // Encode that we are using EBP/RBP as the frame pointer.
+  uint32_t CompactUnwindEncoding = 0;
+  CFAOffset /= StackDivide;
+  if (HasFP) {
+    if ((CFAOffset & 0xFF) != CFAOffset)
+      // Offset was too big for compact encoding.
+      return 0;
+
+    // Get the encoding of the saved registers when we have a frame pointer.
+    uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
+    if (RegEnc == ~0U)
+      return 0;
+
+    CompactUnwindEncoding |= 0x01000000;
+    CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16;
+    CompactUnwindEncoding |= RegEnc & 0x7FFF;
+  } else {
+    unsigned FullOffset = CFAOffset + StackAdjust;
+    if ((FullOffset & 0xFF) == FullOffset) {
+      // Frameless stack.
+      CompactUnwindEncoding |= 0x02000000;
+      CompactUnwindEncoding |= (FullOffset & 0xFF) << 16;
+    } else {
+      if ((CFAOffset & 0x7) != CFAOffset)
+        // The extra stack adjustments are too big for us to handle.
+        return 0;
+
+      // Frameless stack with an offset too large for us to encode compactly.
+      CompactUnwindEncoding |= 0x03000000;
+
+      // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+      // instruction.
+      CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+      // Encode any extra stack stack changes (done via push instructions).
+      CompactUnwindEncoding |= (CFAOffset & 0x7) << 13;
+    }
+
+    // Get the encoding of the saved registers when we don't have a frame
+    // pointer.
+    uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs,
+                                                               6 - SavedRegIdx,
+                                                               Is64Bit);
+    if (RegEnc == ~0U) return 0;
+    CompactUnwindEncoding |= RegEnc & 0x3FF;
+  }
+
+  return CompactUnwindEncoding;
+}
+
 /// emitPrologue - Push callee-saved registers onto the stack, which
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
@@ -370,7 +612,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
   unsigned StackPtr = RegInfo->getStackRegister();
-
   DebugLoc DL;
 
   // If we're forcing a stack realignment we can't rely on just the frame
@@ -398,7 +639,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       !RegInfo->needsStackRealignment(MF) &&
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
       !MFI->adjustsStack() &&                      // No calls.
-      !IsWin64) {                                  // Win64 has no Red Zone
+      !IsWin64 &&                                  // Win64 has no Red Zone
+      !EnableSegmentedStacks) {                    // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -459,7 +701,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (needsFrameMoves) {
       // Mark the place where EBP/RBP was saved.
       MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel);
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+        .addSym(FrameLabel);
 
       // Define the current CFA rule to use the provided offset.
       if (StackSize) {
@@ -478,7 +721,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
     }
 
-    // Update EBP with the new base value...
+    // Update EBP with the new base value.
     BuildMI(MBB, MBBI, DL,
             TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
         .addReg(StackPtr)
@@ -487,7 +730,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (needsFrameMoves) {
       // Mark effective beginning of when frame pointer becomes valid.
       MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel);
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+        .addSym(FrameLabel);
 
       // Define the current CFA to use the EBP/RBP register.
       MachineLocation FPDst(FramePtr);
@@ -504,8 +748,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (RegInfo->needsStackRealignment(MF)) {
       MachineInstr *MI =
         BuildMI(MBB, MBBI, DL,
-                TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
-                StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+                TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+        .addReg(StackPtr)
+        .addImm(-MaxAlign)
+        .setMIFlag(MachineInstr::FrameSetup);
 
       // The EFLAGS implicit def is dead.
       MI->getOperand(3).setIsDead();
@@ -522,6 +768,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
          (MBBI->getOpcode() == X86::PUSH32r ||
           MBBI->getOpcode() == X86::PUSH64r)) {
     PushedRegs = true;
+    MBBI->setFlag(MachineInstr::FrameSetup);
     ++MBBI;
 
     if (!HasFP && needsFrameMoves) {
@@ -530,8 +777,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
 
       // Define the current CFA rule to use the provided offset.
-      unsigned Ptr = StackSize ?
-        MachineLocation::VirtualFP : StackPtr;
+      unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
       MachineLocation SPDst(Ptr);
       MachineLocation SPSrc(Ptr, StackOffset);
       Moves.push_back(MachineMove(Label, SPDst, SPSrc));
@@ -586,26 +832,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 
       // Save EAX
       BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-        .addReg(X86::EAX, RegState::Kill);
+        .addReg(X86::EAX, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (Is64Bit) {
       // Handle the 64-bit Windows ABI case where we need to call __chkstk.
       // Function prologue is responsible for adjusting the stack pointer.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
-        .addImm(NumBytes);
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
     } else {
       // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
       // We'll also use 4 already allocated bytes for EAX.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes);
+        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
     }
 
     BuildMI(MBB, MBBI, DL,
             TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
       .addExternalSymbol(StackProbeSymbol)
       .addReg(StackPtr,    RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+      .setMIFlag(MachineInstr::FrameSetup);
 
     // MSVC x64's __chkstk needs to adjust %rsp.
     // FIXME: %rax preserves the offset and should be available.
@@ -618,6 +868,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
         MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
                                                 X86::EAX),
                                         StackPtr, false, NumBytes - 4);
+        MI->setFlag(MachineInstr::FrameSetup);
         MBB.insert(MBBI, MI);
     }
   } else if (NumBytes)
@@ -627,7 +878,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
     // Mark end of stack pointer adjustment.
     MCSymbol *Label = MMI.getContext().CreateTempSymbol();
-    BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
+    BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+      .addSym(Label);
 
     if (!HasFP && NumBytes) {
       // Define the current CFA rule to use the provided offset.
@@ -647,6 +899,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (PushedRegs)
       emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
   }
+
+  // Darwin 10.7 and greater has support for compact unwind encoding.
+  if (STI.getTargetTriple().isMacOSX() &&
+      !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
+    MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -844,23 +1101,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-void
-X86FrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
-  // Calculate amount of bytes used for return address storing
-  int stackGrowth = (STI.is64Bit() ? -8 : -4);
-  const X86RegisterInfo *RI = TM.getRegisterInfo();
-
-  // Initial state of the frame pointer is esp+stackGrowth.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(RI->getStackRegister(), stackGrowth);
-  Moves.push_back(MachineMove(0, Dst, Src));
-
-  // Add return address to move list
-  MachineLocation CSDst(RI->getStackRegister(), stackGrowth);
-  MachineLocation CSSrc(RI->getRARegister());
-  Moves.push_back(MachineMove(0, CSDst, CSSrc));
-}
-
 int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
   const X86RegisterInfo *RI =
     static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
@@ -873,9 +1113,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
       // Skip the saved EBP.
       Offset += RI->getSlotSize();
     } else {
-      unsigned Align = MFI->getObjectAlignment(FI);
-      assert((-(Offset + StackSize)) % Align == 0);
-      Align = 0;
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
       return Offset + StackSize;
     }
     // FIXME: Support tail calls
@@ -1027,184 +1265,183 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                           true);
     assert(FrameIdx == MFI->getObjectIndexBegin() &&
            "Slot for EBP register must be last in order to be found!");
-    FrameIdx = 0;
+    (void)FrameIdx;
   }
 }
 
-/// permuteEncode - Create the permutation encoding used with frameless
-/// stacks. It is passed the number of registers to be saved and an array of the
-/// registers saved.
-static uint32_t permuteEncode(unsigned SavedCount, unsigned Registers[6]) {
-  // The saved registers are numbered from 1 to 6. In order to encode the order
-  // in which they were saved, we re-number them according to their place in the
-  // register order. The re-numbering is relative to the last re-numbered
-  // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
-  //
-  //    Orig  Re-Num
-  //    ----  ------
-  //     6       6
-  //     2       2
-  //     4       3
-  //     5       3
-  //
-  bool Used[7] = { false, false, false, false, false, false, false };
-  uint32_t RenumRegs[6];
-  for (unsigned I = 0; I < SavedCount; ++I) {
-    uint32_t Renum = 0;
-    for (unsigned U = 1; U < 7; ++U) {
-      if (U == Registers[I])
-        break;
-      if (!Used[U])
-        ++Renum;
-    }
-
-    Used[Registers[I]] = true;
-    RenumRegs[I] = Renum;
+static bool
+HasNestArgument(const MachineFunction *MF) {
+  const Function *F = MF->getFunction();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; I++) {
+    if (I->hasNestAttr())
+      return true;
   }
+  return false;
+}
 
-  // Take the renumbered values and encode them into a 10-bit number.
-  uint32_t permutationEncoding = 0;
-  switch (SavedCount) {
-  case 6:
-    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
-                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
-                           +     RenumRegs[4];
-    break;
-  case 5:
-    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
-                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
-                           +     RenumRegs[4];
-    break;
-  case 4:
-    permutationEncoding |= 60 * RenumRegs[0] + 12 * RenumRegs[1]
-                          + 3 * RenumRegs[2] +      RenumRegs[3];
-    break;
-  case 3:
-    permutationEncoding |= 20 * RenumRegs[0] + 4 * RenumRegs[1]
-                              + RenumRegs[2];
-    break;
-  case 2:
-    permutationEncoding |=  5 * RenumRegs[0] +     RenumRegs[1];
-    break;
-  case 1:
-    permutationEncoding |=      RenumRegs[0];
-    break;
+static unsigned
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF) {
+  if (Is64Bit) {
+    return X86::R11;
+  } else {
+    CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+    bool IsNested = HasNestArgument(&MF);
+
+    if (CallingConvention == CallingConv::X86_FastCall) {
+      if (IsNested) {
+        report_fatal_error("Segmented stacks does not support fastcall with "
+                           "nested function.");
+        return -1;
+      } else {
+        return X86::EAX;
+      }
+    } else {
+      if (IsNested)
+        return X86::EDX;
+      else
+        return X86::ECX;
+    }
   }
-
-  return permutationEncoding;
 }
 
-uint32_t X86FrameLowering::
-getCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs,
-                         int DataAlignmentFactor, bool IsEH) const {
-  uint32_t Encoding = 0;
-  int CFAOffset = 0;
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 };
-  unsigned SavedRegIdx = 0;
-  int FramePointerReg = -1;
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+  MachineBasicBlock &prologueMBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  uint64_t StackSize;
+  bool Is64Bit = STI.is64Bit();
+  unsigned TlsReg, TlsOffset;
+  DebugLoc DL;
+  const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
 
-  for (ArrayRef<MCCFIInstruction>::const_iterator
-         I = Instrs.begin(), E = Instrs.end(); I != E; ++I) {
-    const MCCFIInstruction &Inst = *I;
-    MCSymbol *Label = Inst.getLabel();
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF);
+  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+         "Scratch register is live-in");
 
-    // Ignore invalid labels.
-    if (Label && !Label->isDefined()) continue;
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!ST->isTargetLinux())
+    report_fatal_error("Segmented stacks supported only on linux.");
 
-    unsigned Operation = Inst.getOperation();
-    if (Operation != MCCFIInstruction::Move &&
-        Operation != MCCFIInstruction::RelMove)
-      // FIXME: We can't handle this frame just yet.
-      return 0;
-
-    const MachineLocation &Dst = Inst.getDestination();
-    const MachineLocation &Src = Inst.getSource();
-    const bool IsRelative = (Operation == MCCFIInstruction::RelMove);
-
-    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      if (Src.getReg() != MachineLocation::VirtualFP) {
-        // DW_CFA_def_cfa
-        assert(FramePointerReg == -1 &&"Defining more than one frame pointer?");
-        if (TRI->getLLVMRegNum(Src.getReg(), IsEH) != X86::EBP &&
-            TRI->getLLVMRegNum(Src.getReg(), IsEH) != X86::RBP)
-          // The frame pointer isn't EBP/RBP. Cannot make unwind information
-          // compact.
-          return 0;
-        FramePointerReg = TRI->getCompactUnwindRegNum(Src.getReg(), IsEH);
-      } // else DW_CFA_def_cfa_offset
-
-      if (IsRelative)
-        CFAOffset += Src.getOffset();
-      else
-        CFAOffset -= Src.getOffset();
+  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  bool IsNested = false;
 
-      continue;
-    }
+  // We need to know if the function has a nest argument only in 64 bit mode.
+  if (Is64Bit)
+    IsNested = HasNestArgument(&MF);
 
-    if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
-      // DW_CFA_def_cfa_register
-      assert(FramePointerReg == -1 && "Defining more than one frame pointer?");
+  // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+  // allocMBB needs to be last (terminating) instruction.
+  MachineBasicBlock *restoreR10MBB = NULL;
+  if (IsNested)
+    restoreR10MBB = MF.CreateMachineBasicBlock();
 
-      if (TRI->getLLVMRegNum(Dst.getReg(), IsEH) != X86::EBP &&
-          TRI->getLLVMRegNum(Dst.getReg(), IsEH) != X86::RBP)
-        // The frame pointer isn't EBP/RBP. Cannot make unwind information
-        // compact.
-        return 0;
+  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+         e = prologueMBB.livein_end(); i != e; i++) {
+    allocMBB->addLiveIn(*i);
+    checkMBB->addLiveIn(*i);
 
-      FramePointerReg = TRI->getCompactUnwindRegNum(Dst.getReg(), IsEH);
-      if (SavedRegIdx != 1 || SavedRegs[0] != unsigned(FramePointerReg))
-        return 0;
+    if (IsNested)
+      restoreR10MBB->addLiveIn(*i);
+  }
 
-      SavedRegs[0] = 0;
-      SavedRegIdx = 0;
-      continue;
-    }
+  if (IsNested) {
+    allocMBB->addLiveIn(X86::R10);
+    restoreR10MBB->addLiveIn(X86::RAX);
+  }
 
-    unsigned Reg = Src.getReg();
-    int Offset = Dst.getOffset();
-    if (IsRelative)
-      Offset -= CFAOffset;
-    Offset /= DataAlignmentFactor;
+  if (IsNested)
+    MF.push_front(restoreR10MBB);
+  MF.push_front(allocMBB);
+  MF.push_front(checkMBB);
+
+  // Eventually StackSize will be calculated by a link-time pass; which will
+  // also decide whether checking code needs to be injected into this particular
+  // prologue.
+  StackSize = MFI->getStackSize();
+
+  // Read the limit off the current stacklet off the stack_guard location.
+  if (Is64Bit) {
+    TlsReg = X86::FS;
+    TlsOffset = 0x70;
+
+    BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+      .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+    BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+      .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+  } else {
+    TlsReg = X86::GS;
+    TlsOffset = 0x30;
 
-    if (Offset < 0) {
-      // FIXME: Handle?
-      // DW_CFA_offset_extended_sf
-      return 0;
-    } else if (Reg < 64) {
-      // DW_CFA_offset + Reg
-      if (SavedRegIdx >= 6) return 0;
-      int CURegNum = TRI->getCompactUnwindRegNum(Reg, IsEH);
-      if (CURegNum == -1) return 0;
-      SavedRegs[SavedRegIdx++] = CURegNum;
-    } else {
-      // FIXME: Handle?
-      // DW_CFA_offset_extended
-      return 0;
-    }
+    BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+      .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+    BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+      .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
   }
 
-  // Bail if there are too many registers to encode.
-  if (SavedRegIdx > 6) return 0;
+  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+  // It jumps to normal execution of the function body.
+  BuildMI(checkMBB, DL, TII.get(X86::JG_4)).addMBB(&prologueMBB);
+
+  // On 32 bit we first push the arguments size and then the frame size. On 64
+  // bit, we pass the stack frame size in r10 and the argument size in r11.
+  if (Is64Bit) {
+    // Functions with nested arguments use R10, so it needs to be saved across
+    // the call to _morestack
+
+    if (IsNested)
+      BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+      .addImm(StackSize);
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+      .addImm(X86FI->getArgumentStackSize());
+    MF.getRegInfo().setPhysRegUsed(X86::R10);
+    MF.getRegInfo().setPhysRegUsed(X86::R11);
+  } else {
+    // Since we'll call __morestack, stack alignment needs to be preserved.
+    BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP)
+      .addImm(8);
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(X86FI->getArgumentStackSize());
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(StackSize);
+  }
 
-  // Check if the offset is too big.
-  CFAOffset /= 4;
-  if ((CFAOffset & 0xFF) != CFAOffset)
-    return 0;
-  Encoding |= (CFAOffset & 0xFF) << 16; // Size encoding.
-
-  if (FramePointerReg != -1) {
-    Encoding |= 0x01000000;     // EBP/RBP Unwind Frame
-    for (unsigned I = 0; I != SavedRegIdx; ++I) {
-      unsigned Reg = SavedRegs[I];
-      if (Reg == unsigned(FramePointerReg)) continue;
-      Encoding |= (Reg & 0x7) << (I * 3); // Register encoding
-    }
+  // __morestack is in libgcc
+  if (Is64Bit)
+    BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack");
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack");
+
+  // __morestack only seems to remove 8 bytes off the stack. Add back the
+  // additional 8 bytes we added before pushing the arguments.
+  if (!Is64Bit)
+    BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP)
+      .addImm(8);
+  BuildMI(allocMBB, DL, TII.get(X86::RET));
+
+  if (IsNested)
+    BuildMI(restoreR10MBB, DL, TII.get(X86::MOV64rr), X86::R10)
+      .addReg(X86::RAX);
+
+  if (IsNested) {
+    allocMBB->addSuccessor(restoreR10MBB);
+    restoreR10MBB->addSuccessor(&prologueMBB);
   } else {
-    Encoding |= 0x02000000;     // Frameless unwind with small stack
-    Encoding |= (SavedRegIdx & 0x7) << 10;
-    Encoding |= permuteEncode(SavedRegIdx, SavedRegs);
+    allocMBB->addSuccessor(&prologueMBB);
   }
 
-  return Encoding;
+  checkMBB->addSuccessor(allocMBB);
+  checkMBB->addSuccessor(&prologueMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
 }
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 14c31ed47cf14..6f490640b4edb 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -41,6 +41,8 @@ public:
   void emitPrologue(MachineFunction &MF) const;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 
+  void adjustForSegmentedStacks(MachineFunction &MF) const;
+
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS = NULL) const;
 
@@ -57,11 +59,8 @@ public:
   bool hasFP(const MachineFunction &MF) const;
   bool hasReservedCallFrame(const MachineFunction &MF) const;
 
-  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-
-  uint32_t getCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs,
-                                    int DataAlignmentFactor, bool IsEH) const;
+  uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2b0f283bec759..02b0ff26032bc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -474,10 +474,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
       continue;
     
-    // If the source and destination are SSE registers, then this is a legal
-    // conversion that should not be lowered.
     EVT SrcVT = N->getOperand(0).getValueType();
     EVT DstVT = N->getValueType(0);
+
+    // If any of the sources are vectors, no fp stack involved.
+    if (SrcVT.isVector() || DstVT.isVector())
+      continue;
+
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
     bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
     bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
     if (SrcIsSSE && DstIsSSE)
@@ -2168,9 +2173,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
                                                         MVT::i8, Reg);
 
-        // Emit a testb. No special NOREX tricks are needed since there's
-        // only one GPR operand!
-        return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
+        // target GR8_NOREX registers, so make sure the register class is
+        // forced.
+        return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32,
                                       Subreg, ShiftedImm);
       }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5096d9ae2edfd..7c8ce177ecd30 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -51,6 +51,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 using namespace dwarf;
 
@@ -71,9 +72,6 @@ static SDValue Extract128BitVector(SDValue Vec,
                                    SelectionDAG &DAG,
                                    DebugLoc dl);
 
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG);
-
-
 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
 /// simple subregister reference.  Idx is an index in the 128 bits we
@@ -85,14 +83,10 @@ static SDValue Extract128BitVector(SDValue Vec,
                                    DebugLoc dl) {
   EVT VT = Vec.getValueType();
   assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
-
   EVT ElVT = VT.getVectorElementType();
-
-  int Factor = VT.getSizeInBits() / 128;
-
-  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
-                                  ElVT,
-                                  VT.getVectorNumElements() / Factor);
+  int Factor = VT.getSizeInBits()/128;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
 
   // Extract from UNDEF is UNDEF.
   if (Vec.getOpcode() == ISD::UNDEF)
@@ -111,7 +105,6 @@ static SDValue Extract128BitVector(SDValue Vec,
                                  * ElemsPerChunk);
 
     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
     SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                  VecIdx);
 
@@ -136,21 +129,18 @@ static SDValue Insert128BitVector(SDValue Result,
     assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
 
     EVT ElVT = VT.getVectorElementType();
-
     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
     EVT ResultVT = Result.getValueType();
 
     // Insert the relevant 128 bits.
-    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
 
     // This is the index of the first element of the 128-bit chunk
     // we want.
-    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                  * ElemsPerChunk);
 
     SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
     Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                          VecIdx);
     return Result;
@@ -159,34 +149,6 @@ static SDValue Insert128BitVector(SDValue Result,
   return SDValue();
 }
 
-/// Given two vectors, concat them.
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) {
-  DebugLoc dl = Lower.getDebugLoc();
-
-  assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!");
-
-  EVT VT = EVT::getVectorVT(*DAG.getContext(),
-                            Lower.getValueType().getVectorElementType(),
-                            Lower.getValueType().getVectorNumElements() * 2);
-
-  // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors).
-  assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!");
-
-  // Insert the upper subvector.
-  SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
-                                   DAG.getConstant(
-                                     // This is half the length of the result
-                                     // vector.  Start inserting the upper 128
-                                     // bits here.
-                                     Lower.getValueType().getVectorNumElements(),
-                                     MVT::i32),
-                                   DAG, dl);
-
-  // Insert the lower subvector.
-  Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl);
-  return Vec;
-}
-
 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
   bool is64Bit = Subtarget->is64Bit();
@@ -197,11 +159,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
     return new TargetLoweringObjectFileMachO();
   }
 
-  if (Subtarget->isTargetELF()) {
-    if (is64Bit)
-      return new X8664_ELFTargetObjectFile(TM);
-    return new X8632_ELFTargetObjectFile(TM);
-  }
+  if (Subtarget->isTargetELF())
+    return new TargetLoweringObjectFileELF();
   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
     return new TargetLoweringObjectFileCOFF();
   llvm_unreachable("unknown subtarget type");
@@ -222,6 +181,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // X86 is weird, it always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
+  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
@@ -354,7 +315,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
   } else if (!UseSoftFloat) {
-    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+    // Since AVX is a superset of SSE3, only check for SSE here.
+    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
       // Expand FP_TO_UINT into a select.
       // FIXME: We would like to use a Custom expander here eventually to do
       // the optimal thing for SSE vs. the default expansion in the legalizer.
@@ -417,15 +379,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 
-  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
-  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
-  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
-  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
-  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
-  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
-  if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
-    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
+  if (Subtarget->hasBMI()) {
+    setOperationAction(ISD::CTTZ           , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTTZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasLZCNT()) {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
   }
 
   if (Subtarget->hasPOPCNT()) {
@@ -491,8 +462,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->hasXMM())
     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 
-  // We may not have a libcall for MEMBARRIER so we should lower this.
   setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 
   // On X86 and X86-64, atomic operations are lowered to locked instructions.
   // Locked instructions, in turn, have implicit fence semantics (all memory
@@ -506,9 +477,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     MVT VT = IntVTs[i];
     setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
   }
 
   if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
@@ -518,6 +491,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
   }
 
+  if (Subtarget->hasCmpxchg16b()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+  }
+
   // FIXME - use subtarget debug flags
   if (!Subtarget->isTargetDarwin() &&
       !Subtarget->isTargetELF() &&
@@ -539,7 +516,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 
-  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
@@ -556,11 +534,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC,
-                     (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
-                     (Subtarget->isTargetCOFF()
-                      && !Subtarget->isTargetEnvMacho()
-                      ? Custom : Expand));
+
+  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Custom);
+  else if (EnableSegmentedStacks)
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Expand);
 
   if (!UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
@@ -739,7 +722,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
@@ -754,6 +737,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
       setTruncStoreAction((MVT::SimpleValueType)VT,
@@ -816,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
   }
 
   if (!UseSoftFloat && Subtarget->hasXMMInt()) {
@@ -846,10 +830,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 
-    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
@@ -925,7 +909,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
   }
 
-  if (Subtarget->hasSSE41()) {
+  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
@@ -944,6 +928,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
     setOperationAction(ISD::SHL,                MVT::v16i8, Custom);
 
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
+
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
     // custom since the immediate controlling the insert encodes additional
@@ -964,10 +954,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     }
   }
 
-  if (Subtarget->hasSSE2()) {
+  if (Subtarget->hasXMMInt()) {
     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
 
     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
@@ -977,15 +968,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   }
 
-  if (Subtarget->hasSSE42())
-    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
+  if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+    setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
 
   if (!UseSoftFloat && Subtarget->hasAVX()) {
-    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
 
     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
@@ -1005,6 +997,59 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
 
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
+
+    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
+
+    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+
+    setOperationAction(ISD::VSELECT,            MVT::v4f64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v4i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v8i32, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v8f32, Legal);
+
+    setOperationAction(ISD::ADD,               MVT::v4i64, Custom);
+    setOperationAction(ISD::ADD,               MVT::v8i32, Custom);
+    setOperationAction(ISD::ADD,               MVT::v16i16, Custom);
+    setOperationAction(ISD::ADD,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SUB,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SUB,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SUB,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SUB,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::MUL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::MUL,               MVT::v8i32, Custom);
+    setOperationAction(ISD::MUL,               MVT::v16i16, Custom);
+    // Don't lower v32i8 because there is no 128-bit byte mul
+
     // Custom lower several nodes for 256-bit types.
     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
                   i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1093,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::BUILD_VECTOR);
+  setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
@@ -1100,7 +1146,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -1124,25 +1173,26 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 }
 
 
-MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
-  return MVT::i8;
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector()) return MVT::i8;
+  return VT.changeVectorElementTypeToInteger();
 }
 
 
 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
 /// the desired ByVal argument alignment.
-static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   if (MaxAlign == 16)
     return;
-  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
     if (VTy->getBitWidth() == 128)
       MaxAlign = 16;
-  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     unsigned EltAlign = 0;
     getMaxByValAlign(ATy->getElementType(), EltAlign);
     if (EltAlign > MaxAlign)
       MaxAlign = EltAlign;
-  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
       unsigned EltAlign = 0;
       getMaxByValAlign(STy->getElementType(i), EltAlign);
@@ -1159,7 +1209,7 @@ static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
 /// function arguments in the caller parameter area. For X86, aggregates
 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
 /// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   if (Subtarget->is64Bit()) {
     // Max of 8 and alignment of type.
     unsigned TyAlign = TD->getABITypeAlignment(Ty);
@@ -1203,9 +1253,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
          ((DstAlign == 0 || DstAlign >= 16) &&
           (SrcAlign == 0 || SrcAlign >= 16))) &&
         Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->hasSSE2())
+      if (Subtarget->hasAVX() &&
+          Subtarget->getStackAlignment() >= 32)
+        return MVT::v8f32;
+      if (Subtarget->hasXMMInt())
         return MVT::v4i32;
-      if (Subtarget->hasSSE1())
+      if (Subtarget->hasXMM())
         return MVT::v4f32;
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
@@ -1408,7 +1461,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                                   ValToCopy);
           // If we don't have SSE2 available, convert to v4f32 so the generated
           // register is legal.
-          if (!Subtarget->hasSSE2())
+          if (!Subtarget->hasXMMInt())
             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
         }
       }
@@ -1700,6 +1753,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     // places.
     assert(VA.getValNo() != LastVal &&
            "Don't support value assigned to multiple locs yet");
+    (void)LastVal;
     LastVal = VA.getValNo();
 
     if (VA.isRegLoc()) {
@@ -1917,6 +1971,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   }
 
+  FuncInfo->setArgumentStackSize(StackSize);
+
   return Chain;
 }
 
@@ -2744,8 +2800,6 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::MOVSD:
   case X86ISD::UNPCKLPS:
   case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPS:
-  case X86ISD::VUNPCKLPD:
   case X86ISD::VUNPCKLPSY:
   case X86ISD::VUNPCKLPDY:
   case X86ISD::PUNPCKLWD:
@@ -2754,10 +2808,17 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PUNPCKLQDQ:
   case X86ISD::UNPCKHPS:
   case X86ISD::UNPCKHPD:
+  case X86ISD::VUNPCKHPSY:
+  case X86ISD::VUNPCKHPDY:
   case X86ISD::PUNPCKHWD:
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
   case X86ISD::PUNPCKHQDQ:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
+  case X86ISD::VPERM2F128:
     return true;
   }
   return false;
@@ -2783,6 +2844,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   }
 
@@ -2796,6 +2861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PALIGN:
   case X86ISD::SHUFPD:
   case X86ISD::SHUFPS:
+  case X86ISD::VPERM2F128:
     return DAG.getNode(Opc, dl, VT, V1, V2,
                        DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -2815,8 +2881,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::MOVSD:
   case X86ISD::UNPCKLPS:
   case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPS:
-  case X86ISD::VUNPCKLPD:
   case X86ISD::VUNPCKLPSY:
   case X86ISD::VUNPCKLPDY:
   case X86ISD::PUNPCKLWD:
@@ -2825,6 +2889,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PUNPCKLQDQ:
   case X86ISD::UNPCKHPS:
   case X86ISD::UNPCKHPD:
+  case X86ISD::VUNPCKHPSY:
+  case X86ISD::VUNPCKHPDY:
   case X86ISD::PUNPCKHWD:
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
@@ -3026,6 +3092,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
   return (Val < 0) || (Val >= Low && Val < Hi);
 }
 
+/// isUndefOrInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// range (L, L+Pos]. or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+                             int Pos, int Size, int Low, int Hi) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i)
+    if (!isUndefOrInRange(Mask[i], Low, Hi))
+      return false;
+  return true;
+}
+
 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
 /// specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3034,6 +3111,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
   return false;
 }
 
+/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+                                       int Pos, int Size, int Low) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
 /// the second operand.
@@ -3104,11 +3192,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
 static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool hasSSSE3) {
+                          bool hasSSSE3OrAVX) {
   int i, e = VT.getVectorNumElements();
+  if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+    return false;
 
   // Do not handle v2i64 / v2f64 shuffles with palignr.
-  if (e < 4 || !hasSSSE3)
+  if (e < 4 || !hasSSSE3OrAVX)
     return false;
 
   for (i = 0; i != e; ++i)
@@ -3119,42 +3209,176 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
   if (i == e)
     return false;
 
-  // Determine if it's ok to perform a palignr with only the LHS, since we
-  // don't have access to the actual shuffle elements to see if RHS is undef.
-  bool Unary = Mask[i] < (int)e;
-  bool NeedsUnary = false;
+  // Make sure we're shifting in the right direction.
+  if (Mask[i] <= i)
+    return false;
 
   int s = Mask[i] - i;
 
   // Check the rest of the elements to see if they are consecutive.
   for (++i; i != e; ++i) {
     int m = Mask[i];
-    if (m < 0)
-      continue;
+    if (m >= 0 && m != s+i)
+      return false;
+  }
+  return true;
+}
+
+/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPSY.
+static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                          const X86Subtarget *Subtarget) {
+  int NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  if (NumElems != 8)
+    return false;
 
-    Unary = Unary && (m < (int)e);
-    NeedsUnary = NeedsUnary || (m < s);
+  // VSHUFPSY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
+  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
+  //
+  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
+  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
+  //
+  int QuarterSize = NumElems/4;
+  int HalfSize = QuarterSize*2;
+  for (int i = 0; i < QuarterSize; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+      return false;
+  for (int i = QuarterSize; i < QuarterSize*2; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+      return false;
 
-    if (NeedsUnary && !Unary)
+  // The mask of the second half must be the same as the first but with
+  // the appropriate offsets. This works in the same way as VPERMILPS
+  // works with masks.
+  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
+    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+      return false;
+    int FstHalfIdx = i-HalfSize;
+    if (Mask[FstHalfIdx] < 0)
+      continue;
+    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
       return false;
-    if (Unary && m != ((s+i) & (e-1)))
+  }
+  for (int i = QuarterSize*3; i < NumElems; ++i) {
+    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
       return false;
-    if (!Unary && m != (s+i))
+    int FstHalfIdx = i-HalfSize;
+    if (Mask[FstHalfIdx] < 0)
+      continue;
+    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
       return false;
+
   }
+
   return true;
 }
 
-bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPALIGNRMask(M, N->getValueType(0), true);
+/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
+         "Only supports v8i32 and v8f32 types");
+
+  int HalfSize = NumElems/2;
+  unsigned Mask = 0;
+  for (int i = 0; i != NumElems ; ++i) {
+    if (SVOp->getMaskElt(i) < 0)
+      continue;
+    // The mask of the first half must be equal to the second one.
+    unsigned Shamt = (i%HalfSize)*2;
+    unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
+    Mask |= Elt << Shamt;
+  }
+
+  return Mask;
+}
+
+/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
+/// version and the mask of the second half isn't binded with the first
+/// one.
+static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  int NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  if (NumElems != 4)
+    return false;
+
+  // VSHUFPSY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>      X3       X2       X1       X0
+  //  SRC2 =>      Y3       Y2       Y1       Y0
+  //
+  //  DST  =>  Y2..Y3,  X2..X3,  Y1..Y0,  X1..X0
+  //
+  int QuarterSize = NumElems/4;
+  int HalfSize = QuarterSize*2;
+  for (int i = 0; i < QuarterSize; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+      return false;
+  for (int i = QuarterSize; i < QuarterSize*2; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+      return false;
+  for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
+    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+      return false;
+  for (int i = QuarterSize*3; i < NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+      return false;
+
+  return true;
+}
+
+/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
+static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
+         "Only supports v4i64 and v4f64 types");
+
+  int HalfSize = NumElems/2;
+  unsigned Mask = 0;
+  for (int i = 0; i != NumElems ; ++i) {
+    if (SVOp->getMaskElt(i) < 0)
+      continue;
+    int Elt = SVOp->getMaskElt(i) % HalfSize;
+    Mask |= Elt << i;
+  }
+
+  return Mask;
 }
 
 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// SHUFPS and SHUFPD.
 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
   int NumElems = VT.getVectorNumElements();
+
+  if (VT.getSizeInBits() != 128)
+    return false;
+
   if (NumElems != 2 && NumElems != 4)
     return false;
 
@@ -3204,7 +3428,13 @@ static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
 bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
-  if (N->getValueType(0).getVectorNumElements() != 4)
+  EVT VT = N->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (VT.getSizeInBits() != 128)
+    return false;
+
+  if (NumElems != 4)
     return false;
 
   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
@@ -3218,15 +3448,19 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
 bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
-  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+  EVT VT = N->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (VT.getSizeInBits() != 128)
+    return false;
 
   if (NumElems != 4)
     return false;
 
   return isUndefOrEqual(N->getMaskElt(0), 2) &&
-  isUndefOrEqual(N->getMaskElt(1), 3) &&
-  isUndefOrEqual(N->getMaskElt(2), 2) &&
-  isUndefOrEqual(N->getMaskElt(3), 3);
+         isUndefOrEqual(N->getMaskElt(1), 3) &&
+         isUndefOrEqual(N->getMaskElt(2), 2) &&
+         isUndefOrEqual(N->getMaskElt(3), 3);
 }
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -3273,20 +3507,22 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
 static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool V2IsSplat = false) {
   int NumElts = VT.getVectorNumElements();
-  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
     return false;
 
-  // Handle vector lengths > 128 bits.  Define a "section" as a set of
-  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
-  // sections.
-  unsigned NumSections = VT.getSizeInBits() / 128;
-  if (NumSections == 0 ) NumSections = 1;  // Handle MMX
-  unsigned NumSectionElts = NumElts / NumSections;
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
 
   unsigned Start = 0;
-  unsigned End = NumSectionElts;
-  for (unsigned s = 0; s < NumSections; ++s) {
-    for (unsigned i = Start, j = s * NumSectionElts;
+  unsigned End = NumLaneElts;
+  for (unsigned s = 0; s < NumLanes; ++s) {
+    for (unsigned i = Start, j = s * NumLaneElts;
          i != End;
          i += 2, ++j) {
       int BitI  = Mask[i];
@@ -3302,8 +3538,8 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
       }
     }
     // Process the next 128 bits.
-    Start += NumSectionElts;
-    End += NumSectionElts;
+    Start += NumLaneElts;
+    End += NumLaneElts;
   }
 
   return true;
@@ -3320,21 +3556,38 @@ bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
 static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool V2IsSplat = false) {
   int NumElts = VT.getVectorNumElements();
-  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
     return false;
 
-  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
-    int BitI  = Mask[i];
-    int BitI1 = Mask[i+1];
-    if (!isUndefOrEqual(BitI, j + NumElts/2))
-      return false;
-    if (V2IsSplat) {
-      if (isUndefOrEqual(BitI1, NumElts))
-        return false;
-    } else {
-      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  unsigned Start = 0;
+  unsigned End = NumLaneElts;
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
+                             i != End; i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
         return false;
+      if (V2IsSplat) {
+        if (isUndefOrEqual(BitI1, NumElts))
+          return false;
+      } else {
+        if (!isUndefOrEqual(BitI1, j+NumElts))
+          return false;
+      }
     }
+    // Process the next 128 bits.
+    Start += NumLaneElts;
+    End += NumLaneElts;
   }
   return true;
 }
@@ -3353,16 +3606,21 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
   if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
     return false;
 
-  // Handle vector lengths > 128 bits.  Define a "section" as a set of
-  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
-  // sections.
-  unsigned NumSections = VT.getSizeInBits() / 128;
-  if (NumSections == 0 ) NumSections = 1;  // Handle MMX
-  unsigned NumSectionElts = NumElems / NumSections;
+  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+  // FIXME: Need a better way to get rid of this, there's no latency difference
+  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
+  // the former later. We should also remove the "_undef" special mask.
+  if (NumElems == 4 && VT.getSizeInBits() == 256)
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElems / NumLanes;
 
-  for (unsigned s = 0; s < NumSections; ++s) {
-    for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
-         i != NumSectionElts * (s + 1);
+  for (unsigned s = 0; s < NumLanes; ++s) {
+    for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
+         i != NumLaneElts * (s + 1);
          i += 2, ++j) {
       int BitI  = Mask[i];
       int BitI1 = Mask[i+1];
@@ -3433,6 +3691,189 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }
 
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                             const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  // The shuffle result is divided into half A and half B. In total the two
+  // sources have 4 halves, namely: C, D, E, F. The final values of A and
+  // B must come from C, D, E or F.
+  int HalfSize = VT.getVectorNumElements()/2;
+  bool MatchA = false, MatchB = false;
+
+  // Check if A comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  // Check if B comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int HalfSize = VT.getVectorNumElements()/2;
+
+  int FstHalf = 0, SndHalf = 0;
+  for (int i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (int i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
+/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                            const X86Subtarget *Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  if (!Subtarget->hasAVX())
+    return false;
+
+  // Only match 256-bit with 64-bit types
+  if (VT.getSizeInBits() != 256 || NumElts != 4)
+    return false;
+
+  // The mask on the high lane is independent of the low. Both can match
+  // any element in inside its own lane, but can't cross.
+  int LaneSize = NumElts/NumLanes;
+  for (int l = 0; l < NumLanes; ++l)
+    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+      int LaneStart = l*LaneSize;
+      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+        return false;
+    }
+
+  return true;
+}
+
+/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                            const X86Subtarget *Subtarget) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  if (!Subtarget->hasAVX())
+    return false;
+
+  // Only match 256-bit with 32-bit types
+  if (VT.getSizeInBits() != 256 || NumElts != 8)
+    return false;
+
+  // The mask on the high lane should be the same as the low. Actually,
+  // they can differ if any of the corresponding index in a lane is undef
+  // and the other stays in range.
+  int LaneSize = NumElts/NumLanes;
+  for (int i = 0; i < LaneSize; ++i) {
+    int HighElt = i+LaneSize;
+    bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
+    bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
+
+    if (!HighValid || !LowValid)
+      return false;
+    if (Mask[i] < 0 || Mask[HighElt] < 0)
+      continue;
+    if (Mask[HighElt]-Mask[i] != LaneSize)
+      return false;
+  }
+
+  return true;
+}
+
+/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
+static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+  int LaneSize = NumElts/NumLanes;
+
+  // Although the mask is equal for both lanes do it twice to get the cases
+  // where a mask will match because the same mask element is undef on the
+  // first half but valid on the second. This would get pathological cases
+  // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
+  unsigned Mask = 0;
+  for (int l = 0; l < NumLanes; ++l) {
+    for (int i = 0; i < LaneSize; ++i) {
+      int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
+      if (MaskElt < 0)
+        continue;
+      if (MaskElt >= LaneSize)
+        MaskElt -= LaneSize;
+      Mask |= MaskElt << (i*2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
+static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  unsigned Mask = 0;
+  int LaneSize = NumElts/NumLanes;
+  for (int l = 0; l < NumLanes; ++l)
+    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+      int MaskElt = SVOp->getMaskElt(i);
+      if (MaskElt < 0)
+        continue;
+      Mask |= (MaskElt-l*LaneSize) << i;
+    }
+
+  return Mask;
+}
+
 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -3463,58 +3904,92 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
 
 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
-  if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
+                         const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
     return false;
 
-  // Expect 1, 1, 3, 3
-  for (unsigned i = 0; i < 2; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt >= 0 && Elt != 1)
-      return false;
-  }
+  // The second vector must be undef
+  if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+    return false;
+
+  EVT VT = N->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+      (VT.getSizeInBits() == 256 && NumElems != 8))
+    return false;
 
-  bool HasHi = false;
-  for (unsigned i = 2; i < 4; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt >= 0 && Elt != 3)
+  // "i+1" is the value the indexed mask element must have
+  for (unsigned i = 0; i < NumElems; i += 2)
+    if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
+        !isUndefOrEqual(N->getMaskElt(i+1), i+1))
       return false;
-    if (Elt == 3)
-      HasHi = true;
-  }
-  // Don't use movshdup if it can be done with a shufps.
-  // FIXME: verify that matching u, u, 3, 3 is what we want.
-  return HasHi;
+
+  return true;
 }
 
 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
-  if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
+                         const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+    return false;
+
+  // The second vector must be undef
+  if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+    return false;
+
+  EVT VT = N->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+      (VT.getSizeInBits() == 256 && NumElems != 8))
     return false;
 
-  // Expect 0, 0, 2, 2
-  for (unsigned i = 0; i < 2; ++i)
-    if (N->getMaskElt(i) > 0)
+  // "i" is the value the indexed mask element must have
+  for (unsigned i = 0; i < NumElems; i += 2)
+    if (!isUndefOrEqual(N->getMaskElt(i), i) ||
+        !isUndefOrEqual(N->getMaskElt(i+1), i))
       return false;
 
-  bool HasHi = false;
-  for (unsigned i = 2; i < 4; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt >= 0 && Elt != 2)
+  return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
+                           const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  int NumElts = VT.getVectorNumElements();
+  bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
+      !V2IsUndef || NumElts != 4)
+    return false;
+
+  for (int i = 0; i != NumElts/2; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), 0))
       return false;
-    if (Elt == 2)
-      HasHi = true;
-  }
-  // Don't use movsldup if it can be done with a shufps.
-  return HasHi;
+  for (int i = NumElts/2; i != NumElts; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+      return false;
+  return true;
 }
 
 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
 bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
-  int e = N->getValueType(0).getVectorNumElements() / 2;
+  EVT VT = N->getValueType(0);
+
+  if (VT.getSizeInBits() != 128)
+    return false;
 
+  int e = VT.getVectorNumElements() / 2;
   for (int i = 0; i < e; ++i)
     if (!isUndefOrEqual(N->getMaskElt(i), i))
       return false;
@@ -3627,6 +4102,7 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
     if (Val >= 0)
       break;
   }
+  assert(Val - i > 0 && "PALIGNR imm should be positive");
   return (Val - i) * EltSize;
 }
 
@@ -3644,7 +4120,6 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
   EVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
   return Index / NumElemsPerChunk;
 }
 
@@ -3662,7 +4137,6 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
   EVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
   return Index / NumElemsPerChunk;
 }
 
@@ -3716,7 +4190,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
 /// V1 (and in order), and the upper half elements should come from the upper
 /// half of V2 (and in order).
 static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
-  if (Op->getValueType(0).getVectorNumElements() != 4)
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+  if (VT.getVectorNumElements() != 4)
     return false;
   for (unsigned i = 0, e = 2; i != e; ++i)
     if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
@@ -3748,6 +4225,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
 /// MOVLP, it must be either a vector load or a scalar load to vector.
 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                                ShuffleVectorSDNode *Op) {
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+
   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
     return false;
   // Is V2 is a vector load, don't do this transformation. We will try to use
@@ -3755,7 +4236,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
   if (ISD::isNON_EXTLoad(V2))
     return false;
 
-  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
 
   if (NumElems != 2 && NumElems != 4)
     return false;
@@ -3811,7 +4292,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
 
 /// getZeroVector - Returns a vector of specified type with all zero elements.
 ///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
                              DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
@@ -3819,7 +4300,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
   // to their dest type. This ensures they get CSE'd.
   SDValue Vec;
   if (VT.getSizeInBits() == 128) {  // SSE
-    if (HasSSE2) {  // SSE2
+    if (HasXMMInt) {  // SSE2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
     } else { // SSE1
@@ -3838,21 +4319,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
 }
 
 /// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
+/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
+/// original type, ensuring they get CSE'd.
 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
   assert((VT.is128BitVector() || VT.is256BitVector())
          && "Expected a 128-bit or 256-bit vector type");
 
   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                            Cst, Cst, Cst, Cst);
 
-  SDValue Vec;
   if (VT.is256BitVector()) {
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
-  } else
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+                              Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+    Vec = Insert128BitVector(InsV, Vec,
+                  DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+  }
+
   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
 }
 
@@ -3902,7 +4387,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
 }
 
-/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
@@ -3915,31 +4400,95 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
 }
 
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  EVT PVT = MVT::v4f32;
-  EVT VT = SV->getValueType(0);
-  DebugLoc dl = SV->getDebugLoc();
-  SDValue V1 = SV->getOperand(0);
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target suported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+  EVT VT = V.getValueType();
   int NumElems = VT.getVectorNumElements();
-  int EltNo = SV->getSplatIndex();
+  DebugLoc dl = V.getDebugLoc();
 
-  // unpack elements to the correct location
   while (NumElems > 4) {
     if (EltNo < NumElems/2) {
-      V1 = getUnpackl(DAG, dl, VT, V1, V1);
+      V = getUnpackl(DAG, dl, VT, V, V);
     } else {
-      V1 = getUnpackh(DAG, dl, VT, V1, V1);
+      V = getUnpackh(DAG, dl, VT, V, V);
       EltNo -= NumElems/2;
     }
     NumElems >>= 1;
   }
+  return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+  EVT VT = V.getValueType();
+  DebugLoc dl = V.getDebugLoc();
+  assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+         && "Vector size not supported");
+
+  if (VT.getSizeInBits() == 128) {
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+                             &SplatMask[0]);
+  } else {
+    // To use VPERMILPS to splat scalars, the second half of indicies must
+    // refer to the higher part, which is a duplication of the lower one,
+    // because VPERMILPS can only handle in-lane permutations.
+    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+                             &SplatMask[0]);
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+
+  int EltNo = SV->getSplatIndex();
+  int NumElems = SrcVT.getVectorNumElements();
+  unsigned Size = SrcVT.getSizeInBits();
+
+  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+          "Unknown how to promote splat for type");
+
+  // Extract the 128-bit part containing the splat element and update
+  // the splat element index when it refers to the higher register.
+  if (Size == 256) {
+    unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+    V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
+    if (Idx > 0)
+      EltNo -= NumElems/2;
+  }
+
+  // All i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target suported shuffles.
+  EVT EltVT = SrcVT.getVectorElementType();
+  if (EltVT == MVT::i8 || EltVT == MVT::i16)
+    V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+  // Recreate the 256-bit vector and place the same 128-bit vector
+  // into the low and high part. This is necessary because we want
+  // to use VPERM* to shuffle the vectors
+  if (Size == 256) {
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
+                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    V1 = Insert128BitVector(InsV, V1,
+               DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+  }
 
-  // Perform the splat.
-  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
-  V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
-  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
-  return DAG.getNode(ISD::BITCAST, dl, VT, V1);
+  return getLegalSplat(DAG, V1, EltNo);
 }
 
 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
@@ -3947,11 +4496,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
 /// element of V2 is swizzled into the zero/undef vector, landing at element
 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
-                                             bool isZero, bool HasSSE2,
-                                             SelectionDAG &DAG) {
+                                           bool isZero, bool HasXMMInt,
+                                           SelectionDAG &DAG) {
   EVT VT = V2.getValueType();
   SDValue V1 = isZero
-    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+    ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 16> MaskVec;
   for (unsigned i = 0; i != NumElems; ++i)
@@ -4005,6 +4554,8 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
       break;
     case X86ISD::UNPCKHPS:
     case X86ISD::UNPCKHPD:
+    case X86ISD::VUNPCKHPSY:
+    case X86ISD::VUNPCKHPDY:
       DecodeUNPCKHPMask(NumElems, ShuffleMask);
       break;
     case X86ISD::PUNPCKLBW:
@@ -4015,8 +4566,6 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
       break;
     case X86ISD::UNPCKLPS:
     case X86ISD::UNPCKLPD:
-    case X86ISD::VUNPCKLPS:
-    case X86ISD::VUNPCKLPD:
     case X86ISD::VUNPCKLPSY:
     case X86ISD::VUNPCKLPDY:
       DecodeUNPCKLPMask(VT, ShuffleMask);
@@ -4052,8 +4601,41 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
       return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                  Depth+1);
     }
+    case X86ISD::VPERMILPS:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::VPERMILPSY:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::VPERMILPD:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::VPERMILPDY:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::VPERM2F128:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                           ShuffleMask);
+      break;
+    case X86ISD::MOVDDUP:
+    case X86ISD::MOVLHPD:
+    case X86ISD::MOVLPD:
+    case X86ISD::MOVLPS:
+    case X86ISD::MOVSHDUP:
+    case X86ISD::MOVSLDUP:
+    case X86ISD::PALIGN:
+      return SDValue(); // Not yet implemented.
     default:
-      assert("not implemented for target shuffle node");
+      assert(0 && "unknown target shuffle node");
       return SDValue();
     }
 
@@ -4205,6 +4787,11 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 /// logical left or right shift of a vector.
 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  // Although the logic below support any bitwidth size, there are no
+  // shift instructions which handle more than 128-bit vectors.
+  if (SVOp->getValueType(0).getSizeInBits() > 128)
+    return false;
+
   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
     return true;
@@ -4295,6 +4882,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                          unsigned NumBits, SelectionDAG &DAG,
                          const TargetLowering &TLI, DebugLoc dl) {
+  assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
   EVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4333,42 +4921,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
       return SDValue();
     }
 
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
     SDValue Chain = LD->getChain();
-    // Make sure the stack object alignment is at least 16.
+    // Make sure the stack object alignment is at least 16 or 32.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-    if (DAG.InferPtrAlignment(Ptr) < 16) {
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
       if (MFI->isFixedObjectIndex(FI)) {
         // Can't change the alignment. FIXME: It's possible to compute
         // the exact stack offset and reference FI + adjust offset instead.
         // If someone *really* cares about this. That's the way to implement it.
         return SDValue();
       } else {
-        MFI->setObjectAlignment(FI, 16);
+        MFI->setObjectAlignment(FI, RequiredAlign);
       }
     }
 
-    // (Offset % 16) must be multiple of 4. Then address is then
+    // (Offset % 16 or 32) must be multiple of 4. Then address is then
     // Ptr + (Offset & ~15).
     if (Offset < 0)
       return SDValue();
-    if ((Offset % 16) & 3)
+    if ((Offset % RequiredAlign) & 3)
       return SDValue();
-    int64_t StartOffset = Offset & ~15;
+    int64_t StartOffset = Offset & ~(RequiredAlign-1);
     if (StartOffset)
       Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
 
     int EltNo = (Offset - StartOffset) >> 2;
-    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
-    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
-    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
+    int NumElems = VT.getVectorNumElements();
+
+    EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                              LD->getPointerInfo().getWithOffset(StartOffset),
                              false, false, 0);
-    // Canonicalize it to a v4i32 shuffle.
-    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
-                                            DAG.getUNDEF(MVT::v4i32),&Mask[0]));
+
+    // Canonicalize it to a v4i32 or v8i32 shuffle.
+    SmallVector<int, 8> Mask;
+    for (int i = 0; i < NumElems; ++i)
+      Mask.push_back(EltNo);
+
+    V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
+    return DAG.getNode(ISD::BITCAST, dl, NVT,
+                       DAG.getVectorShuffle(CanonVT, dl, V1,
+                                            DAG.getUNDEF(CanonVT),&Mask[0]));
   }
 
   return SDValue();
@@ -4428,12 +5026,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                        LDBase->getPointerInfo(),
                        LDBase->isVolatile(), LDBase->isNonTemporal(),
                        LDBase->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1) {
+  } else if (NumElems == 4 && LastLoadedElt == 1 &&
+             DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
-    SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
-                                              Ops, 2, MVT::i32,
-                                              LDBase->getMemOperand());
+    SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+                                LDBase->getPointerInfo(),
+                                LDBase->getAlignment(),
+                                false/*isVolatile*/, true/*ReadMem*/,
+                                false/*WriteMem*/);
     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   }
   return SDValue();
@@ -4445,47 +5047,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   EVT VT = Op.getValueType();
   EVT ExtVT = VT.getVectorElementType();
-
   unsigned NumElems = Op.getNumOperands();
 
-  // For AVX-length vectors, build the individual 128-bit pieces and
-  // use shuffles to put them in place.
-  if (VT.getSizeInBits() > 256 &&
-      Subtarget->hasAVX() &&
-      !ISD::isBuildVectorAllZeros(Op.getNode())) {
-    SmallVector<SDValue, 8> V;
-    V.resize(NumElems);
-    for (unsigned i = 0; i < NumElems; ++i) {
-      V[i] = Op.getOperand(i);
-    }
-
-    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
-
-    // Build the lower subvector.
-    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
-    // Build the upper subvector.
-    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
-                                NumElems/2);
+  // Vectors containing all zeros can be matched by pxor and xorps later
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (Op.getValueType() == MVT::v4i32 ||
+        Op.getValueType() == MVT::v8i32)
+      return Op;
 
-    return ConcatVectors(Lower, Upper, DAG);
+    return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
   }
 
-  // All zero's:
-  //  - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
-  // All one's:
-  //  - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
-  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
-      ISD::isBuildVectorAllOnes(Op.getNode())) {
-    // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
-    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
-    // eliminated on x86-32 hosts.
-    if (Op.getValueType() == MVT::v4i32 ||
-        Op.getValueType() == MVT::v8i32)
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors.
+  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (Op.getValueType() == MVT::v4i32)
       return Op;
 
-    if (ISD::isBuildVectorAllOnes(Op.getNode()))
-      return getOnesVector(Op.getValueType(), DAG, dl);
-    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+    return getOnesVector(Op.getValueType(), DAG, dl);
   }
 
   unsigned EVTBits = ExtVT.getSizeInBits();
@@ -4538,7 +5119,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
         Item = getShuffleVectorZeroOrUndef(Item, 0, true,
-                                           Subtarget->hasSSE2(), DAG);
+                                           Subtarget->hasXMMInt(), DAG);
 
         // Now we have our 32-bit value zero extended in the low element of
         // a vector.  If Idx != 0, swizzle it into place.
@@ -4566,7 +5147,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
-        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+        return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(),
                                            DAG);
       } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
@@ -4574,7 +5155,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
         EVT MiddleVT = MVT::v4i32;
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
         Item = getShuffleVectorZeroOrUndef(Item, 0, true,
-                                           Subtarget->hasSSE2(), DAG);
+                                           Subtarget->hasXMMInt(), DAG);
         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
       }
     }
@@ -4603,7 +5184,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
       // Turn it into a shuffle of zero and zero-extended scalar to vector.
       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
-                                         Subtarget->hasSSE2(), DAG);
+                                         Subtarget->hasXMMInt(), DAG);
       SmallVector<int, 8> MaskVec;
       for (unsigned i = 0; i < NumElems; i++)
         MaskVec.push_back(i == Idx ? 0 : 1);
@@ -4631,6 +5212,27 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
+  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+    SmallVector<SDValue, 32> V;
+    for (unsigned i = 0; i < NumElems; ++i)
+      V.push_back(Op.getOperand(i));
+
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+    // Build both the lower and upper subvector.
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+                                NumElems/2);
+
+    // Recreate the wider vector with the lower and upper part.
+    SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
+                                DAG.getConstant(0, MVT::i32), DAG, dl);
+    return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
+                              DAG, dl);
+  }
+
   // Let legalizer expand 2-wide build_vectors.
   if (EVTBits == 64) {
     if (NumNonZero == 1) {
@@ -4639,7 +5241,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                  Op.getOperand(Idx));
       return getShuffleVectorZeroOrUndef(V2, Idx, true,
-                                         Subtarget->hasSSE2(), DAG);
+                                         Subtarget->hasXMMInt(), DAG);
     }
     return SDValue();
   }
@@ -4664,7 +5266,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     for (unsigned i = 0; i < 4; ++i) {
       bool isZero = !(NonZeros & (1 << i));
       if (isZero)
-        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+        V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
       else
         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
     }
@@ -4708,7 +5310,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       return LD;
 
     // For SSE 4.1, use insertps to put the high elements into the low element.
-    if (getSubtarget()->hasSSE41()) {
+    if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) {
       SDValue Result;
       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -4758,13 +5360,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  // We support concatenate two MMX registers and place them in a MMX
-  // register.  This is better than doing a stack convert.
+// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
+// them in a MMX register.  This is better than doing a stack convert.
+static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   EVT ResVT = Op.getValueType();
-  assert(Op.getNumOperands() == 2);
+
   assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
          ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
   int Mask[2];
@@ -4785,6 +5386,42 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
 }
 
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ResVT = Op.getValueType();
+
+  assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+
+  SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
+                                 DAG.getConstant(0, MVT::i32), DAG, dl);
+  return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+                            DAG, dl);
+}
+
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+  EVT ResVT = Op.getValueType();
+
+  assert(Op.getNumOperands() == 2);
+  assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
+         "Unsupported CONCAT_VECTORS for value type");
+
+  // We support concatenate two MMX registers and place them in a MMX register.
+  // This is better than doing a stack convert.
+  if (ResVT.is128BitVector())
+    return LowerMMXCONCAT_VECTORS(Op, DAG);
+
+  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+  return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
 // v8i16 shuffles - Prefer shuffles in the following order:
 // 1. [all]   pshuflw, pshufhw, optional move
 // 2. [ssse3] 1 x pshufb
@@ -4844,7 +5481,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
   // quads, disable the next transformation since it does not help SSSE3.
   bool V1Used = InputQuads[0] || InputQuads[1];
   bool V2Used = InputQuads[2] || InputQuads[3];
-  if (Subtarget->hasSSSE3()) {
+  if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
     if (InputQuads.count() == 2 && V1Used && V2Used) {
       BestLoQuad = InputQuads.find_first();
       BestHiQuad = InputQuads.find_next(BestLoQuad);
@@ -4917,7 +5554,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
   // If we have SSSE3, and all words of the result are from 1 input vector,
   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   // is present, fall back to case 4.
-  if (Subtarget->hasSSSE3()) {
+  if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
     SmallVector<SDValue,16> pshufbMask;
 
     // If we have elements from both input vectors, set the high bit of the
@@ -4985,7 +5622,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                 &MaskV[0]);
 
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+        (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                NewV.getOperand(0),
                                X86::getShufflePSHUFLWImmediate(NewV.getNode()),
@@ -5013,7 +5651,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                 &MaskV[0]);
 
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+        (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                               NewV.getOperand(0),
                               X86::getShufflePSHUFHWImmediate(NewV.getNode()),
@@ -5079,7 +5718,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   }
 
   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3()) {
+  if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) {
     SmallVector<SDValue,16> pshufbMask;
 
     // If all result elements are from one input vector, then only translate
@@ -5276,15 +5915,109 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
                                              OpVT, SrcOp)));
 }
 
-/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
-/// shuffles.
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node referes to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+  int HalfSize = NumElems/2;
+  SmallVector<int, 16> M;
+  SVOp->getMask(M);
+  bool MatchA = false, MatchB = false;
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+    // If each half of a vector shuffle node referes to only one lane in the
+    // source vectors, extract each used 128-bit lane and shuffle them using
+    // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+    // the work to the legalizer.
+    DebugLoc dl = SVOp->getDebugLoc();
+    EVT VT = SVOp->getValueType(0);
+    int NumElems = VT.getVectorNumElements();
+    int HalfSize = NumElems/2;
+
+    // Extract the reference for each half
+    int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+    int FstVecOpNum = 0, SndVecOpNum = 0;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      FstVecOpNum = Elt/NumElems;
+      FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      SndVecOpNum = Elt/NumElems;
+      SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+
+    // Extract the subvectors
+    SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+                      DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+                      DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+    // Generate 128-bit shuffles
+    SmallVector<int, 16> MaskV1, MaskV2;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+
+    EVT NVT = V1.getValueType();
+    V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+    V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+    // Concatenate the result back
+    SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+                                   DAG.getConstant(0, MVT::i32), DAG, dl);
+    return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+                              DAG, dl);
+  }
+
+  return SDValue();
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
 static SDValue
-LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   DebugLoc dl = SVOp->getDebugLoc();
   EVT VT = SVOp->getValueType(0);
 
+  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+
   SmallVector<std::pair<int, int>, 8> Locs;
   Locs.resize(4);
   SmallVector<int, 8> Mask1(4U, -1);
@@ -5542,18 +6275,21 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
 
 static
 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
-                        bool HasSSE2) {
+                        bool HasXMMInt) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   EVT VT = Op.getValueType();
 
   assert(VT != MVT::v2i64 && "unsupported shuffle type");
 
-  if (HasSSE2 && VT == MVT::v2f64)
+  if (HasXMMInt && VT == MVT::v2f64)
     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
 
-  // v4f32 or v4i32
-  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
+  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
 }
 
 static
@@ -5572,8 +6308,24 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
 }
 
+static inline unsigned getSHUFPOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v8i32: // Use fp unit for int unpack.
+  case MVT::v8f32:
+  case MVT::v4i32: // Use fp unit for int unpack.
+  case MVT::v4f32: return X86ISD::SHUFPS;
+  case MVT::v4i64: // Use fp unit for int unpack.
+  case MVT::v4f64:
+  case MVT::v2i64: // Use fp unit for int unpack.
+  case MVT::v2f64: return X86ISD::SHUFPD;
+  default:
+    llvm_unreachable("Unknown type for shufp*");
+  }
+  return 0;
+}
+
 static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   EVT VT = Op.getValueType();
@@ -5602,7 +6354,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
     CanFoldLoad = false;
 
   if (CanFoldLoad) {
-    if (HasSSE2 && NumElems == 2)
+    if (HasXMMInt && NumElems == 2)
       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
 
     if (NumElems == 4)
@@ -5616,28 +6368,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   // this is horrible, but will stay like this until we move all shuffle
   // matching to x86 specific nodes. Note that for the 1st condition all
   // types are matched with movsd.
-  if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
-    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-  else if (HasSSE2)
+  if (HasXMMInt) {
+    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+    // as to remove this logic from here, as much as possible
+    if (NumElems == 2 || !X86::isMOVLMask(SVOp))
+      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-
+  }
 
   assert(VT != MVT::v4i32 && "unsupported shuffle type");
 
   // Invert the operand order and use SHUFPS to match it.
-  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+  return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
                               X86::getShuffleSHUFImmediate(SVOp), DAG);
 }
 
-static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) {
+static inline unsigned getUNPCKLOpcode(EVT VT) {
   switch(VT.getSimpleVT().SimpleTy) {
   case MVT::v4i32: return X86ISD::PUNPCKLDQ;
   case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
-  case MVT::v4f32:
-    return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS;
-  case MVT::v2f64:
-    return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
+  case MVT::v4f32: return X86ISD::UNPCKLPS;
+  case MVT::v2f64: return X86ISD::UNPCKLPD;
+  case MVT::v8i32: // Use fp unit for int unpack.
   case MVT::v8f32: return X86ISD::VUNPCKLPSY;
+  case MVT::v4i64: // Use fp unit for int unpack.
   case MVT::v4f64: return X86ISD::VUNPCKLPDY;
   case MVT::v16i8: return X86ISD::PUNPCKLBW;
   case MVT::v8i16: return X86ISD::PUNPCKLWD;
@@ -5653,6 +6407,10 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
   case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
   case MVT::v4f32: return X86ISD::UNPCKHPS;
   case MVT::v2f64: return X86ISD::UNPCKHPD;
+  case MVT::v8i32: // Use fp unit for int unpack.
+  case MVT::v8f32: return X86ISD::VUNPCKHPSY;
+  case MVT::v4i64: // Use fp unit for int unpack.
+  case MVT::v4f64: return X86ISD::VUNPCKHPDY;
   case MVT::v16i8: return X86ISD::PUNPCKHBW;
   case MVT::v8i16: return X86ISD::PUNPCKHWD;
   default:
@@ -5661,6 +6419,68 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
   return 0;
 }
 
+static inline unsigned getVPERMILOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32:
+  case MVT::v4f32: return X86ISD::VPERMILPS;
+  case MVT::v2i64:
+  case MVT::v2f64: return X86ISD::VPERMILPD;
+  case MVT::v8i32:
+  case MVT::v8f32: return X86ISD::VPERMILPSY;
+  case MVT::v4i64:
+  case MVT::v4f64: return X86ISD::VPERMILPDY;
+  default:
+    llvm_unreachable("Unknown type for vpermil");
+  }
+  return 0;
+}
+
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+  EVT VT = Op.getValueType();
+  bool Is256 = VT.getSizeInBits() == 256;
+
+  assert((VT.getSizeInBits() == 128 || Is256) &&
+         "Unsupported type for vbroadcast node");
+
+  SDValue V = Op;
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (Is256 && !(V.hasOneUse() &&
+                 V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                 V.getOperand(0).getOpcode() == ISD::UNDEF))
+    return false;
+
+  if (Is256)
+    V = V.getOperand(1);
+
+  if (!V.hasOneUse())
+    return false;
+
+  // Check the source scalar_to_vector type. 256-bit broadcasts are
+  // supported for 32/64-bit sizes, while 128-bit ones are only supported
+  // for 32-bit scalars.
+  if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+
+  unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+  if (ScalarSize != 32 && ScalarSize != 64)
+    return false;
+  if (!Is256 && ScalarSize == 64)
+    return false;
+
+  V = V.getOperand(0);
+  if (!MayFoldLoad(V))
+    return false;
+
+  // Return the load node
+  Op = V;
+  return true;
+}
+
 static
 SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                const TargetLowering &TLI,
@@ -5672,23 +6492,29 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
   SDValue V2 = Op.getOperand(1);
 
   if (isZeroShuffle(SVOp))
-    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+    return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
 
   // Handle splat operations
   if (SVOp->isSplat()) {
-    // Special case, this is the only place now where it's
-    // allowed to return a vector_shuffle operation without
-    // using a target specific node, because *hopefully* it
-    // will be optimized away by the dag combiner.
-    if (VT.getVectorNumElements() <= 4 &&
-        CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
+    unsigned NumElem = VT.getVectorNumElements();
+    int Size = VT.getSizeInBits();
+    // Special case, this is the only place now where it's allowed to return
+    // a vector_shuffle operation without using a target specific node, because
+    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
+    // this be moved to DAGCombine instead?
+    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
       return Op;
 
-    // Handle splats by matching through known masks
-    if (VT.getVectorNumElements() <= 4)
+    // Use vbroadcast whenever the splat comes from a foldable load
+    if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+
+    // Handle splats by matching through known shuffle masks
+    if ((Size == 128 && NumElem <= 4) ||
+        (Size == 256 && NumElem < 8))
       return SDValue();
 
-    // Canonicalize all of the remaining to v4f32.
+    // All remaning splats are promoted to target supported vector shuffles.
     return PromoteSplat(SVOp, DAG);
   }
 
@@ -5698,7 +6524,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
     if (NewOp.getNode())
       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
-  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+  } else if ((VT == MVT::v4i32 ||
+             (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
     // FIXME: Figure out a cleaner way to do this.
     // Try to make use of movq to zero out the top part.
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
@@ -5731,9 +6558,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   bool V1IsSplat = false;
   bool V2IsSplat = false;
-  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
-  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
-  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
+  bool HasXMMInt = Subtarget->hasXMMInt();
   MachineFunction &MF = DAG.getMachineFunction();
   bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
 
@@ -5765,21 +6590,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   // unpckh_undef). Only use pshufd if speed is more important than size.
   if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
-    if (VT != MVT::v2i64 && VT != MVT::v2f64)
-      return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG);
+    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
   if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
-    if (VT != MVT::v2i64 && VT != MVT::v2f64)
-      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
 
-  if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
-      RelaxedMayFoldVectorLoad(V1))
+  if (X86::isMOVDDUPMask(SVOp) &&
+      (Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      V2IsUndef && RelaxedMayFoldVectorLoad(V1))
     return getMOVDDup(Op, dl, V1, DAG);
 
   if (X86::isMOVHLPS_v_undef_Mask(SVOp))
     return getMOVHighToLow(Op, dl, DAG);
 
   // Use to match splats
-  if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
+  if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
       (VT == MVT::v2f64 || VT == MVT::v2i64))
     return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
 
@@ -5792,24 +6616,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 
     unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
 
-    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+    if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
 
-    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
-                                  TargetMask, DAG);
-
-    if (VT == MVT::v4f32)
-      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
-                                  TargetMask, DAG);
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+                                TargetMask, DAG);
   }
 
   // Check if this can be converted into a logical shift.
   bool isLeft = false;
   unsigned ShAmt = 0;
   SDValue ShVal;
-  bool isShift = getSubtarget()->hasSSE2() &&
-    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  bool isShift = getSubtarget()->hasXMMInt() &&
+                 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   if (isShift && ShVal.hasOneUse()) {
     // If the shifted value has multiple uses, it may be cheaper to use
     // v_set0 + movlhps or movhlps, etc.
@@ -5824,7 +6643,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     if (ISD::isBuildVectorAllZeros(V1.getNode()))
       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
     if (!X86::isMOVLPMask(SVOp)) {
-      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+      if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
 
       if (VT == MVT::v4i32 || VT == MVT::v4f32)
@@ -5834,19 +6653,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 
   // FIXME: fold these into legal mask.
   if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
-    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+    return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
 
   if (X86::isMOVHLPSMask(SVOp))
     return getMOVHighToLow(Op, dl, DAG);
 
-  if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+  if (X86::isMOVSHDUPMask(SVOp, Subtarget))
     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
 
-  if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+  if (X86::isMOVSLDUPMask(SVOp, Subtarget))
     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
 
   if (X86::isMOVLPMask(SVOp))
-    return getMOVLP(Op, dl, DAG, HasSSE2);
+    return getMOVLP(Op, dl, DAG, HasXMMInt);
 
   if (ShouldXformToMOVHLPS(SVOp) ||
       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
@@ -5887,8 +6706,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (X86::isUNPCKLMask(SVOp))
-    return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
-                                dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
 
   if (X86::isUNPCKHMask(SVOp))
     return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
@@ -5915,8 +6733,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
 
     if (X86::isUNPCKLMask(NewSVOp))
-      return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
-                                  dl, VT, V2, V1, DAG);
+      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
 
     if (X86::isUNPCKHMask(NewSVOp))
       return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
@@ -5932,18 +6749,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   SmallVector<int, 16> M;
   SVOp->getMask(M);
 
-  if (isPALIGNRMask(M, VT, HasSSSE3))
+  if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()))
     return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
                                 X86::getShufflePALIGNRImmediate(SVOp),
                                 DAG);
 
   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
       SVOp->getSplatIndex() == 0 && V2IsUndef) {
-    if (VT == MVT::v2f64) {
-      X86ISD::NodeType Opcode =
-        getSubtarget()->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
-      return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG);
-    }
+    if (VT == MVT::v2f64)
+      return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
     if (VT == MVT::v2i64)
       return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
   }
@@ -5958,23 +6772,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 X86::getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT)) {
-    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
-    if (VT == MVT::v4f32 || VT == MVT::v4i32)
-      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
-                                  TargetMask, DAG);
-    if (VT == MVT::v2f64 || VT == MVT::v2i64)
-      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
-                                  TargetMask, DAG);
-  }
+  if (isSHUFPMask(M, VT))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                X86::getShuffleSHUFImmediate(SVOp), DAG);
 
   if (X86::isUNPCKL_v_undef_Mask(SVOp))
-    if (VT != MVT::v2i64 && VT != MVT::v2f64)
-      return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
-                                  dl, VT, V1, V1, DAG);
+    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
   if (X86::isUNPCKH_v_undef_Mask(SVOp))
-    if (VT != MVT::v2i64 && VT != MVT::v2f64)
-      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Generate target specific nodes for 128 or 256-bit shuffles only
+  // supported in the AVX instruction set.
+  //
+
+  // Handle VMOVDDUPY permutations
+  if (isMOVDDUPYMask(SVOp, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+  // Handle VPERMILPS* permutations
+  if (isVPERMILPSMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+                                getShuffleVPERMILPSImmediate(SVOp), DAG);
+
+  // Handle VPERMILPD* permutations
+  if (isVPERMILPDMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+                                getShuffleVPERMILPDImmediate(SVOp), DAG);
+
+  // Handle VPERM2F128 permutations
+  if (isVPERM2F128Mask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+                                getShuffleVPERM2F128Immediate(SVOp), DAG);
+
+  // Handle VSHUFPSY permutations
+  if (isVSHUFPSYMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                getShuffleVSHUFPSYImmediate(SVOp), DAG);
+
+  // Handle VSHUFPDY permutations
+  if (isVSHUFPDYMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                getShuffleVSHUFPDYImmediate(SVOp), DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Since no target specific shuffle was selected for this generic one,
+  // lower it into other known shuffles. FIXME: this isn't true yet, but
+  // this is the plan.
+  //
 
   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
   if (VT == MVT::v8i16) {
@@ -5989,9 +6834,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
       return NewOp;
   }
 
-  // Handle all 4 wide cases with a number of shuffles.
-  if (NumElems == 4)
-    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+  // Handle all 128-bit wide vectors with 4 elements, and match them with
+  // several different shuffle types.
+  if (NumElems == 4 && VT.getSizeInBits() == 128)
+    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+  // Handle general 256-bit shuffles
+  if (VT.is256BitVector())
+    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
 
   return SDValue();
 }
@@ -6001,6 +6851,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                 SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+    return SDValue();
+
   if (VT.getSizeInBits() == 8) {
     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                     Op.getOperand(0), Op.getOperand(1));
@@ -6060,36 +6914,26 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   SDValue Vec = Op.getOperand(0);
   EVT VecVT = Vec.getValueType();
 
-  // If this is a 256-bit vector result, first extract the 128-bit
-  // vector and then extract from the 128-bit vector.
-  if (VecVT.getSizeInBits() > 128) {
+  // If this is a 256-bit vector result, first extract the 128-bit vector and
+  // then extract the element from the 128-bit vector.
+  if (VecVT.getSizeInBits() == 256) {
     DebugLoc dl = Op.getNode()->getDebugLoc();
     unsigned NumElems = VecVT.getVectorNumElements();
     SDValue Idx = Op.getOperand(1);
-
-    if (!isa<ConstantSDNode>(Idx))
-      return SDValue();
-
-    unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128);
     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
 
     // Get the 128-bit vector.
-    bool Upper = IdxVal >= ExtractNumElems;
-    Vec = Extract128BitVector(Vec, Idx, DAG, dl);
-
-    // Extract from it.
-    SDValue ScaledIdx = Idx;
-    if (Upper)
-      ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx,
-                              DAG.getConstant(ExtractNumElems,
-                                              Idx.getValueType()));
+    bool Upper = IdxVal >= NumElems/2;
+    Vec = Extract128BitVector(Vec,
+                    DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
-                       ScaledIdx);
+                    Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
   }
 
   assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
 
-  if (Subtarget->hasSSE41()) {
+  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
     if (Res.getNode())
       return Res;
@@ -6120,7 +6964,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
       return Op;
 
     // SHUFPS the element to the lowest double word, then movss.
-    int Mask[4] = { Idx, -1, -1, -1 };
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
     EVT VVT = Op.getOperand(0).getValueType();
     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
@@ -6159,6 +7003,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
   SDValue N1 = Op.getOperand(1);
   SDValue N2 = Op.getOperand(2);
 
+  if (VT.getSizeInBits() == 256)
+    return SDValue();
+
   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
       isa<ConstantSDNode>(N2)) {
     unsigned Opc;
@@ -6206,35 +7053,28 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   SDValue N1 = Op.getOperand(1);
   SDValue N2 = Op.getOperand(2);
 
-  // If this is a 256-bit vector result, first insert into a 128-bit
-  // vector and then insert into the 256-bit vector.
-  if (VT.getSizeInBits() > 128) {
+  // If this is a 256-bit vector result, first extract the 128-bit vector,
+  // insert the element into the extracted half and then place it back.
+  if (VT.getSizeInBits() == 256) {
     if (!isa<ConstantSDNode>(N2))
       return SDValue();
 
-    // Get the 128-bit vector.
+    // Get the desired 128-bit vector half.
     unsigned NumElems = VT.getVectorNumElements();
     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
-    bool Upper = IdxVal >= NumElems / 2;
-
-    SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);
+    bool Upper = IdxVal >= NumElems/2;
+    SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
+    SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
 
-    // Insert into it.
-    SDValue ScaledN2 = N2;
-    if (Upper)
-      ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
-                             DAG.getConstant(NumElems /
-                                             (VT.getSizeInBits() / 128),
-                                             N2.getValueType()));
-    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
-                     N1, ScaledN2);
+    // Insert the element into the desired half.
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
+                 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
 
-    // Insert the 128-bit vector
-    // FIXME: Why UNDEF?
-    return Insert128BitVector(N0, Op, N2, DAG, dl);
+    // Insert the changed part back to the 256-bit vector
+    return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
   }
 
-  if (Subtarget->hasSSE41())
+  if (Subtarget->hasSSE41() || Subtarget->hasAVX())
     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
 
   if (EltVT == MVT::i8)
@@ -6405,12 +7245,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   CodeModel::Model M = getTargetMachine().getCodeModel();
 
   if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
     WrapperKind = X86ISD::WrapperRIP;
-  else if (Subtarget->isPICStyleGOT())
-    OpFlag = X86II::MO_GOTOFF;
-  else if (Subtarget->isPICStyleStubPIC())
-    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  }
 
   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
 
@@ -6427,6 +7272,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
                          Result);
   }
 
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, 0);
+
   return Result;
 }
 
@@ -6676,7 +7527,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // And our return value (tls address) is in the standard call return value
     // location.
     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
-    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+                              Chain.getValue(1));
   }
 
   assert(false &&
@@ -6922,9 +7774,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
 
   // Load the 32-bit value into an XMM register.
   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                         Op.getOperand(0),
-                                         DAG.getIntPtrConstant(0)));
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
+                                     DAG);
 
   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
@@ -7513,6 +8367,9 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
 }
 
 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
+
   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
@@ -7563,6 +8420,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getConstant(X86CC, MVT::i8), EFLAGS);
 }
 
+// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+         "Unsupported value type for operation");
+
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC = Op.getOperand(2);
+  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+
 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond;
   SDValue Op0 = Op.getOperand(0);
@@ -7575,11 +8465,21 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
 
   if (isFP) {
     unsigned SSECC = 8;
-    EVT VT0 = Op0.getValueType();
-    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
-    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+    EVT EltVT = Op0.getValueType().getVectorElementType();
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+    unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
     bool Swap = false;
 
+    // SSE Condition code mapping:
+    //  0 - EQ
+    //  1 - LT
+    //  2 - LE
+    //  3 - UNORD
+    //  4 - NEQ
+    //  5 - NLT
+    //  6 - NLE
+    //  7 - ORD
     switch (SetCCOpcode) {
     default: break;
     case ISD::SETOEQ:
@@ -7624,6 +8524,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
   }
 
+  // Break 256-bit integer vector compare into smaller ones.
+  if (!isFP && VT.getSizeInBits() == 256)
+    return Lower256IntVSETCC(Op, DAG);
+
   // We are handling one of the integer comparisons here.  Since SSE only has
   // GT and EQ comparisons for integer, swapping operands and multiple
   // operations may be required for some comparisons.
@@ -7654,6 +8558,13 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
   if (Swap)
     std::swap(Op0, Op1);
 
+  // Check that the operation in question is available (most are plain SSE2,
+  // but PCMPGTQ and PCMPEQQ have different requirements).
+  if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX())
+    return SDValue();
+  if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX())
+    return SDValue();
+
   // Since SSE has no unsigned integer comparisons, we need to flip  the sign
   // bits of the inputs before performing those operations.
   if (FlipSigns) {
@@ -8014,9 +8925,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 SDValue
 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                            SelectionDAG &DAG) const {
-  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
-         "This should be used only on Windows targets");
-  assert(!Subtarget->isTargetEnvMacho());
+  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+          EnableSegmentedStacks) &&
+         "This should be used only on Windows targets or when segmented stacks "
+         "are being used");
+  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
   DebugLoc dl = Op.getDebugLoc();
 
   // Get the inputs.
@@ -8024,23 +8937,49 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   SDValue Size  = Op.getOperand(1);
   // FIXME: Ensure alignment here
 
-  SDValue Flag;
+  bool Is64Bit = Subtarget->is64Bit();
+  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
 
-  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
-  unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+  if (EnableSegmentedStacks) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
-  Flag = Chain.getValue(1);
+    if (Is64Bit) {
+      // The 64 bit implementation of segmented stacks needs to clobber both r10
+      // r11. This makes it impossible to use it along with nested parameters.
+      const Function *F = MF.getFunction();
+
+      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+           I != E; I++)
+        if (I->hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+    }
+
+    const TargetRegisterClass *AddrRegClass =
+      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+                                DAG.getRegister(Vreg, SPTy));
+    SDValue Ops1[2] = { Value, Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  } else {
+    SDValue Flag;
+    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
 
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+    Flag = Chain.getValue(1);
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
-  Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
-  Flag = Chain.getValue(1);
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+    Flag = Chain.getValue(1);
 
-  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+    Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
 
-  SDValue Ops1[2] = { Chain.getValue(0), Chain };
-  return DAG.getMergeValues(Ops1, 2, dl);
+    SDValue Ops1[2] = { Chain.getValue(0), Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  }
 }
 
 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -8118,7 +9057,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
 
   EVT ArgVT = Op.getNode()->getValueType(0);
-  const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
   uint8_t ArgMode;
 
@@ -8292,6 +9231,19 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
                                 DAG.getConstant(X86CC, MVT::i8), Cond);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+  // Arithmetic intrinsics.
+  case Intrinsic::x86_sse3_hadd_ps:
+  case Intrinsic::x86_sse3_hadd_pd:
+  case Intrinsic::x86_avx_hadd_ps_256:
+  case Intrinsic::x86_avx_hadd_pd_256:
+    return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse3_hsub_ps:
+  case Intrinsic::x86_sse3_hsub_pd:
+  case Intrinsic::x86_avx_hsub_ps_256:
+  case Intrinsic::x86_avx_hsub_pd_256:
+    return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
   // or testp pattern and a setcc for the result.
@@ -8535,8 +9487,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
                      Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
 }
 
-SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
-                                             SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
   SDValue Root = Op.getOperand(0);
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
@@ -8552,8 +9509,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
 
-    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
-    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
+    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
 
     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
 
@@ -8600,9 +9557,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                 MachinePointerInfo(TrmpAddr, 22),
                                 false, false, 0);
 
-    SDValue Ops[] =
-      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
-    return DAG.getMergeValues(Ops, 2, dl);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
   } else {
     const Function *Func =
       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -8619,7 +9574,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
       NestReg = X86::ECX;
 
       // Check that ECX wasn't needed by an 'inreg' parameter.
-      const FunctionType *FTy = Func->getFunctionType();
+      FunctionType *FTy = Func->getFunctionType();
       const AttrListPtr &Attrs = Func->getAttributes();
 
       if (!Attrs.isEmpty() && !Func->isVarArg()) {
@@ -8657,7 +9612,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
 
     // This is storing the opcode for MOV32ri.
     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
-    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
     OutChains[0] = DAG.getStore(Root, dl,
                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                 Trmp, MachinePointerInfo(TrmpAddr),
@@ -8682,9 +9637,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                 MachinePointerInfo(TrmpAddr, 6),
                                 false, false, 1);
 
-    SDValue Ops[] =
-      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
-    return DAG.getMergeValues(Ops, 2, dl);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
   }
 }
 
@@ -8822,8 +9775,58 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
-SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType().getSizeInBits() == 256 &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType().getSizeInBits() == 256 &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.getSizeInBits() == 256)
+    return Lower256IntArith(Op, DAG);
+
   assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
   DebugLoc dl = Op.getDebugLoc();
 
@@ -8872,11 +9875,51 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
-
   LLVMContext *Context = DAG.getContext();
 
-  // Must have SSE2.
-  if (!Subtarget->hasSSE2()) return SDValue();
+  if (!Subtarget->hasXMMInt())
+    return SDValue();
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.getSizeInBits() == 256) {
+    int NumElems = VT.getVectorNumElements();
+    MVT EltVT = VT.getVectorElementType().getSimpleVT();
+    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    // Extract the two vectors
+    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+                                     DAG, dl);
+
+    // Recreate the shift amount vectors
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (int i = 0; i < NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (int i = NumElems/2; i < NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt1Csts[0], NumElems/2);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt2Csts[0], NumElems/2);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+                                 DAG, dl);
+    }
+
+    // Issue new vector shifts for the smaller types
+    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+    // Concatenate the result back
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+  }
 
   // Optimize shl/srl/sra with constant shift amount.
   if (isSplatVector(Amt.getNode())) {
@@ -8927,9 +9970,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Lower SHL with variable shift amount.
-  // Cannot lower SHL without SSE2 or later.
-  if (!Subtarget->hasSSE2()) return SDValue();
-
   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
@@ -8971,7 +10011,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
     M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                     DAG.getConstant(4, MVT::i32));
-    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
     // a += a
     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
 
@@ -8986,13 +10026,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
     M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                     DAG.getConstant(2, MVT::i32));
-    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
     // a += a
     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
 
     // return pblendv(r, r+r, a);
-    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
-                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
+                    R, DAG.getNode(ISD::ADD, dl, VT, R, R));
     return R;
   }
   return SDValue();
@@ -9057,8 +10097,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
                   DAG.getConstant(X86::COND_O, MVT::i32),
                   SDValue(Sum.getNode(), 2));
 
-    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
-    return Sum;
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   }
   }
 
@@ -9071,8 +10110,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
                 DAG.getConstant(Cond, MVT::i32),
                 SDValue(Sum.getNode(), 1));
 
-  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
-  return Sum;
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }
 
 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
@@ -9080,8 +10118,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
   SDNode* Node = Op.getNode();
   EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
   EVT VT = Node->getValueType(0);
-
-  if (Subtarget->hasSSE2() && VT.isVector()) {
+  if (Subtarget->hasXMMInt() && VT.isVector()) {
     unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                         ExtraVT.getScalarType().getSizeInBits();
     SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
@@ -9091,11 +10128,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
     switch (VT.getSimpleVT().SimpleTy) {
       default:
         return SDValue();
-      case MVT::v2i64: {
-        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
-        SRAIntrinsicsID = 0;
-        break;
-      }
       case MVT::v4i32: {
         SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
         SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
@@ -9115,12 +10147,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
     // In case of 1 bit sext, no need to shr
     if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
 
-    if (SRAIntrinsicsID) {
-      Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(SRAIntrinsicsID, MVT::i32),
-                         Tmp1, ShAmt);
-    }
-    return Tmp1;
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+                       Tmp1, ShAmt);
   }
 
   return SDValue();
@@ -9132,7 +10161,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
 
   // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
   // There isn't any reason to disable it if the target processor supports it.
-  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
+  if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
     SDValue Chain = Op.getOperand(0);
     SDValue Zero = DAG.getConstant(0, MVT::i32);
     SDValue Ops[] = {
@@ -9172,6 +10201,45 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
   return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 }
 
+SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // The only fence that needs an instruction is a sequentially-consistent
+  // cross-thread fence.
+  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+    // no-sse2). There isn't any reason to disable it if the target processor
+    // supports it.
+    if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+    SDValue Chain = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, MVT::i32);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32), // Base
+      DAG.getTargetConstant(1, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i32),        // Index
+      DAG.getTargetConstant(0, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),        // Segment.
+      Zero,
+      Chain
+    };
+    SDNode *Res =
+      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+                         array_lengthof(Ops));
+    return SDValue(Res, 0);
+  }
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+
 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
   EVT T = Op.getValueType();
   DebugLoc DL = Op.getDebugLoc();
@@ -9227,7 +10295,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                             SelectionDAG &DAG) const {
   EVT SrcVT = Op.getOperand(0).getValueType();
   EVT DstVT = Op.getValueType();
-  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+  assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -9255,7 +10323,34 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
                        Node->getOperand(0),
                        Node->getOperand(1), negOp,
                        cast<AtomicSDNode>(Node)->getSrcValue(),
-                       cast<AtomicSDNode>(Node)->getAlignment());
+                       cast<AtomicSDNode>(Node)->getAlignment(),
+                       cast<AtomicSDNode>(Node)->getOrdering(),
+                       cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert seq_cst store -> xchg
+  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+  // FIXME: On 32-bit, store -> fist or movq would be more efficient
+  //        (The only way to get a 16-byte store is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Node->getOperand(2),
+                                 cast<AtomicSDNode>(Node)->getMemOperand(),
+                                 cast<AtomicSDNode>(Node)->getOrdering(),
+                                 cast<AtomicSDNode>(Node)->getSynchScope());
+    return Swap.getValue(1);
+  }
+  // Other atomic stores have a simple pattern.
+  return Op;
 }
 
 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
@@ -9291,8 +10386,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   default: llvm_unreachable("Should not custom lower this!");
   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
   case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
   case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -9318,7 +10415,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
-  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
@@ -9332,11 +10428,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
-  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
-  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:                return LowerShift(Op, DAG);
@@ -9352,15 +10449,38 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDE:
   case ISD::SUBC:
   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ADD:                return LowerADD(Op, DAG);
+  case ISD::SUB:                return LowerSUB(Op, DAG);
   }
 }
 
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+                                  SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert wide load -> cmpxchg8b/cmpxchg16b
+  // FIXME: On 32-bit, load -> fild or movq would be more efficient
+  //        (The only way to get a 16-byte load is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+                               Node->getOperand(0),
+                               Node->getOperand(1), Zero, Zero,
+                               cast<AtomicSDNode>(Node)->getMemOperand(),
+                               cast<AtomicSDNode>(Node)->getOrdering(),
+                               cast<AtomicSDNode>(Node)->getSynchScope());
+  Results.push_back(Swap.getValue(0));
+  Results.push_back(Swap.getValue(1));
+}
+
 void X86TargetLowering::
 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                         SelectionDAG &DAG, unsigned NewOp) const {
-  EVT T = Node->getValueType(0);
   DebugLoc dl = Node->getDebugLoc();
-  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+  assert (Node->getValueType(0) == MVT::i64 &&
+          "Only know how to expand i64 atomics");
 
   SDValue Chain = Node->getOperand(0);
   SDValue In1 = Node->getOperand(1);
@@ -9423,37 +10543,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   }
   case ISD::ATOMIC_CMP_SWAP: {
     EVT T = N->getValueType(0);
-    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+    bool Regs64bit = T == MVT::i128;
+    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
     SDValue cpInL, cpInH;
-    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
-                        DAG.getConstant(0, MVT::i32));
-    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
-                        DAG.getConstant(1, MVT::i32));
-    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
-    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
-                             cpInL.getValue(1));
+    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(0, HalfT));
+    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(1, HalfT));
+    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+                             Regs64bit ? X86::RAX : X86::EAX,
+                             cpInL, SDValue());
+    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+                             Regs64bit ? X86::RDX : X86::EDX,
+                             cpInH, cpInL.getValue(1));
     SDValue swapInL, swapInH;
-    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
-                          DAG.getConstant(0, MVT::i32));
-    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
-                          DAG.getConstant(1, MVT::i32));
-    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
-                               cpInH.getValue(1));
-    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
-                               swapInL.getValue(1));
+    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(0, HalfT));
+    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(1, HalfT));
+    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+                               Regs64bit ? X86::RBX : X86::EBX,
+                               swapInL, cpInH.getValue(1));
+    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+                               Regs64bit ? X86::RCX : X86::ECX, 
+                               swapInH, swapInL.getValue(1));
     SDValue Ops[] = { swapInH.getValue(0),
                       N->getOperand(1),
                       swapInH.getValue(1) };
     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
-    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
+    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+                                  X86ISD::LCMPXCHG8_DAG;
+    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
                                              Ops, 3, T, MMO);
-    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
-                                        MVT::i32, Result.getValue(1));
-    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
-                                        MVT::i32, cpOutL.getValue(2));
+    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+                                        Regs64bit ? X86::RAX : X86::EAX,
+                                        HalfT, Result.getValue(1));
+    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+                                        Regs64bit ? X86::RDX : X86::EDX,
+                                        HalfT, cpOutL.getValue(2));
     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
-    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
     Results.push_back(cpOutH.getValue(1));
     return;
   }
@@ -9478,6 +10609,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::ATOMIC_SWAP:
     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
     return;
+  case ISD::ATOMIC_LOAD:
+    ReplaceATOMIC_LOAD(N, Results, DAG);
   }
 }
 
@@ -9527,11 +10660,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PSIGNB:             return "X86ISD::PSIGNB";
   case X86ISD::PSIGNW:             return "X86ISD::PSIGNW";
   case X86ISD::PSIGND:             return "X86ISD::PSIGND";
-  case X86ISD::PBLENDVB:           return "X86ISD::PBLENDVB";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::FHADD:              return "X86ISD::FHADD";
+  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
@@ -9570,6 +10704,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
+  case X86ISD::ANDN:               return "X86ISD::ANDN";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
   case X86ISD::TESTP:              return "X86ISD::TESTP";
@@ -9596,9 +10731,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   case X86ISD::UNPCKLPS:           return "X86ISD::UNPCKLPS";
   case X86ISD::UNPCKLPD:           return "X86ISD::UNPCKLPD";
-  case X86ISD::VUNPCKLPS:          return "X86ISD::VUNPCKLPS";
-  case X86ISD::VUNPCKLPD:          return "X86ISD::VUNPCKLPD";
-  case X86ISD::VUNPCKLPSY:         return "X86ISD::VUNPCKLPSY";
   case X86ISD::VUNPCKLPDY:         return "X86ISD::VUNPCKLPDY";
   case X86ISD::UNPCKHPS:           return "X86ISD::UNPCKHPS";
   case X86ISD::UNPCKHPD:           return "X86ISD::UNPCKHPD";
@@ -9610,16 +10742,24 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VPERMILPS:          return "X86ISD::VPERMILPS";
+  case X86ISD::VPERMILPSY:         return "X86ISD::VPERMILPSY";
+  case X86ISD::VPERMILPD:          return "X86ISD::VPERMILPD";
+  case X86ISD::VPERMILPDY:         return "X86ISD::VPERMILPDY";
+  case X86ISD::VPERM2F128:         return "X86ISD::VPERM2F128";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
+  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
+  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   }
 }
 
 // isLegalAddressingMode - Return true if the addressing mode represented
 // by AM is legal for this target, for a load/store of the specified type.
 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              const Type *Ty) const {
+                                              Type *Ty) const {
   // X86 supports extremely general addressing modes.
   CodeModel::Model M = getTargetMachine().getCodeModel();
   Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -9671,7 +10811,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
 }
 
 
-bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
     return false;
   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
@@ -9691,7 +10831,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return true;
 }
 
-bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
 }
@@ -9715,7 +10855,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                       EVT VT) const {
   // Very little shuffling can be done for 64-bit vectors right now.
   if (VT.getSizeInBits() == 64)
-    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
+    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX());
 
   // FIXME: pshufb, blends, shifts.
   return (VT.getVectorNumElements() == 2 ||
@@ -9725,7 +10865,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
           isPSHUFDMask(M, VT) ||
           isPSHUFHWMask(M, VT) ||
           isPSHUFLWMask(M, VT) ||
-          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) ||
           isUNPCKLMask(M, VT) ||
           isUNPCKHMask(M, VT) ||
           isUNPCKL_v_undef_Mask(M, VT) ||
@@ -10158,7 +11298,9 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
     if (!(Op.isReg() && Op.isImplicit()))
       MIB.addOperand(Op);
   }
-  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+  BuildMI(*BB, MI, dl,
+    TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
+             MI->getOperand(0).getReg())
     .addReg(X86::XMM0);
 
   MI->eraseFromParent();
@@ -10513,6 +11655,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
     MBB->addSuccessor(EndMBB);
   }
 
+  unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   // In the XMM save block, save all the XMM argument registers.
   for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -10521,7 +11664,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
         MachineMemOperand::MOStore,
         /*Size=*/16, /*Align=*/16);
-    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
       .addFrameIndex(RegSaveFrameIndex)
       .addImm(/*Scale=*/1)
       .addReg(/*IndexReg=*/0)
@@ -10565,17 +11708,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
 
   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   // live into the sink and copy blocks.
-  const MachineFunction *MF = BB->getParent();
-  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
-  BitVector ReservedRegs = TRI->getReservedRegs(*MF);
-
-  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
-    const MachineOperand &MO = MI->getOperand(I);
-    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
-    unsigned Reg = MO.getReg();
-    if (Reg != X86::EFLAGS) continue;
-    copy0MBB->addLiveIn(Reg);
-    sinkMBB->addLiveIn(Reg);
+  if (!MI->killsRegister(X86::EFLAGS)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
   }
 
   // Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -10610,6 +11745,119 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   return sinkMBB;
 }
 
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+                                        bool Is64Bit) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+  assert(EnableSegmentedStacks);
+
+  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+  // BB:
+  //  ... [Till the alloca]
+  // If stacklet is not large enough, jump to mallocMBB
+  //
+  // bumpMBB:
+  //  Allocate by subtracting from RSP
+  //  Jump to continueMBB
+  //
+  // mallocMBB:
+  //  Allocate by call to runtime
+  //
+  // continueMBB:
+  //  ...
+  //  [rest of original BB]
+  //
+
+  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *AddrRegClass =
+    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+    sizeVReg = MI->getOperand(1).getReg(),
+    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+  MachineFunction::iterator MBBIter = BB;
+  ++MBBIter;
+
+  MF->insert(MBBIter, bumpMBB);
+  MF->insert(MBBIter, mallocMBB);
+  MF->insert(MBBIter, continueMBB);
+
+  continueMBB->splice(continueMBB->begin(), BB, llvm::next
+                      (MachineBasicBlock::iterator(MI)), BB->end());
+  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add code to the main basic block to check if the stack limit has been hit,
+  // and if so, jump to mallocMBB otherwise to bumpMBB.
+  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
+    .addReg(tmpSPVReg).addReg(sizeVReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+    .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(tmpSPVReg);
+  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+  // bumpMBB simply decreases the stack pointer, since we know the current
+  // stacklet has enough space.
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+    .addReg(tmpSPVReg);
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+    .addReg(tmpSPVReg);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Calls into a routine in libgcc to allocate more space from the heap.
+  if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+    .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+  } else {
+    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+      .addImm(12);
+    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space");
+  }
+
+  if (!Is64Bit)
+    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+      .addImm(16);
+
+  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+    .addReg(Is64Bit ? X86::RAX : X86::EAX);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Set up the CFG correctly.
+  BB->addSuccessor(bumpMBB);
+  BB->addSuccessor(mallocMBB);
+  mallocMBB->addSuccessor(continueMBB);
+  bumpMBB->addSuccessor(continueMBB);
+
+  // Take care of the PHI nodes.
+  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(mallocPtrVReg).addMBB(mallocMBB)
+    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+  // Delete the original pseudo instruction.
+  MI->eraseFromParent();
+
+  // And we're done.
+  return continueMBB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                           MachineBasicBlock *BB) const {
@@ -10718,11 +11966,11 @@ MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
-  default: assert(false && "Unexpected instr type to insert");
+  default: assert(0 && "Unexpected instr type to insert");
   case X86::TAILJMPd64:
   case X86::TAILJMPr64:
   case X86::TAILJMPm64:
-    assert(!"TAILJMP64 would not be touched here.");
+    assert(0 && "TAILJMP64 would not be touched here.");
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
   case X86::TCRETURNmi64:
@@ -10745,6 +11993,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return BB;
   case X86::WIN_ALLOCA:
     return EmitLoweredWinAlloca(MI, BB);
+  case X86::SEG_ALLOCA_32:
+    return EmitLoweredSegAlloca(MI, BB, false);
+  case X86::SEG_ALLOCA_64:
+    return EmitLoweredSegAlloca(MI, BB, true);
   case X86::TLSCall_32:
   case X86::TLSCall_64:
     return EmitLoweredTLSCall(MI, BB);
@@ -10754,6 +12006,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V4F32:
   case X86::CMOV_V2F64:
   case X86::CMOV_V2I64:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
   case X86::CMOV_RFP32:
@@ -11074,6 +12329,33 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
     KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                        Mask.getBitWidth() - 1);
     break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    unsigned NumLoBits = 0;
+    switch (IntId) {
+    default: break;
+    case Intrinsic::x86_sse_movmsk_ps:
+    case Intrinsic::x86_avx_movmsk_ps_256:
+    case Intrinsic::x86_sse2_movmsk_pd:
+    case Intrinsic::x86_avx_movmsk_pd_256:
+    case Intrinsic::x86_mmx_pmovmskb:
+    case Intrinsic::x86_sse2_pmovmskb_128: {
+      // High bits of movmskp{s|d}, pmovmskb are known zero.
+      switch (IntId) {
+        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
+        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
+        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
+        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
+      }
+      KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
+                                        Mask.getBitWidth() - NumLoBits);
+      break;
+    }
+    }
+    break;
+  }
   }
 }
 
@@ -11102,23 +12384,132 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
-/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
-/// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  DebugLoc dl = N->getDebugLoc();
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+      V2.getOpcode() == ISD::CONCAT_VECTORS) {
+    //
+    //                   0,0,0,...
+    //                      |
+    //    V      UNDEF    BUILD_VECTOR    UNDEF
+    //     \      /           \           /
+    //  CONCAT_VECTOR         CONCAT_VECTOR
+    //         \                  /
+    //          \                /
+    //          RESULT: V + zero extended
+    //
+    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+        V1.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+      return SDValue();
+
+    // To match the shuffle mask, the first half of the mask should
+    // be exactly the first vector, and all the rest a splat with the
+    // first element of the second one.
+    for (int i = 0; i < NumElems/2; ++i)
+      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+        return SDValue();
+
+    // Emit a zeroed vector and insert the desired subvector on its
+    // first half.
+    SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
+                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+                                    DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                             V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  return SDValue();
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
-                                     TargetLowering::DAGCombinerInfo &DCI) {
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget *Subtarget) {
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
 
-  if (VT.getSizeInBits() != 128)
-    return SDValue();
-
   // Don't create instructions with illegal types after legalize types has run.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
     return SDValue();
 
+  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE)
+    return PerformShuffleCombine256(N, DAG, DCI);
+
+  // Only handle 128 wide vector from here on.
+  if (VT.getSizeInBits() != 128)
+    return SDValue();
+
+  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+  // consecutive, non-overlapping, and in the right order.
   SmallVector<SDValue, 16> Elts;
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
@@ -11209,7 +12600,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
   DebugLoc DL = N->getDebugLoc();
@@ -11217,14 +12609,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // Get the LHS/RHS of the select.
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
 
   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   // instructions match the semantics of the common C idiom x<y?x:y but not
   // x<=y?x:y, because of how they handle negative zero (which can be
   // ignored in unsafe-math mode).
-  if (Subtarget->hasSSE2() &&
-      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
-      Cond.getOpcode() == ISD::SETCC) {
+  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      (Subtarget->hasXMMInt() ||
+       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
 
     unsigned Opcode = 0;
@@ -11267,7 +12661,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
         // Converting this to a max would handle comparisons between positive
         // and negative zero incorrectly.
         if (!UnsafeFPMath &&
-            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
           break;
         Opcode = X86ISD::FMAX;
         break;
@@ -11680,7 +13074,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   // all elements are shifted by the same amount.  We can't do this in legalize
   // because the a constant vector is typically transformed to a constant pool
   // so we have no knowledge of the shift amount.
-  if (!Subtarget->hasSSE2())
+  if (!Subtarget->hasXMMInt())
     return SDValue();
 
   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
@@ -11796,7 +13190,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
 
   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   // we're requiring SSE2 for both.
-  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+  if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDValue CMP0 = N0->getOperand(1);
@@ -11864,6 +13258,36 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from a insert_subvector building a 256-bit
+  // allones vector
+  if (VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
+
+  return false;
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
@@ -11874,11 +13298,28 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (R.getNode())
     return R;
 
+  EVT VT = N->getValueType(0);
+
+  // Create ANDN instructions
+  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    DebugLoc DL = N->getDebugLoc();
+
+    // Check LHS for not
+    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
+      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
+    // Check RHS for not
+    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
+      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
+
+    return SDValue();
+  }
+
   // Want to form ANDNP nodes:
   // 1) In the hopes of then easily combining them with OR and AND nodes
   //    to form PBLEND/PSIGN.
   // 2) To match ANDN packed intrinsics
-  EVT VT = N->getValueType(0);
   if (VT != MVT::v2i64 && VT != MVT::v4i64)
     return SDValue();
 
@@ -11888,12 +13329,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
 
   // Check LHS for vnot
   if (N0.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
 
   // Check RHS for vnot
   if (N1.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
 
   return SDValue();
@@ -11917,7 +13360,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
 
   // look for psign/blend
-  if (Subtarget->hasSSSE3()) {
+  if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
     if (VT == MVT::v2i64) {
       // Canonicalize pandn to RHS
       if (N0.getOpcode() == X86ISD::ANDNP)
@@ -11990,13 +13433,13 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
           }
         }
         // PBLENDVB only available on SSE 4.1
-        if (!Subtarget->hasSSE41())
+        if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
           return SDValue();
 
         X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
         Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
         Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
-        Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
+        Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
         return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
       }
     }
@@ -12057,24 +13500,211 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  DebugLoc dl = Ld->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  // If this is a vector EXT Load then attempt to optimize it using a
+  // shuffle. We need SSE4 for the shuffles.
+  // TODO: It is possible to support ZExt by zeroing the undef values
+  // during the shuffle phase or after the shuffle.
+  if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+    assert(MemVT != RegVT && "Cannot extend to the same type");
+    assert(MemVT.isVector() && "Must load a vector from memory");
+
+    unsigned NumElems = RegVT.getVectorNumElements();
+    unsigned RegSz = RegVT.getSizeInBits();
+    unsigned MemSz = MemVT.getSizeInBits();
+    assert(RegSz > MemSz && "Register size must be greater than the mem size");
+    // All sizes must be a power of two
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+    // Attempt to load the original value using a single load op.
+    // Find a scalar type which is equal to the loaded word size.
+    MVT SclrLoadTy = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) &&  Tp.getSizeInBits() == MemSz) {
+        SclrLoadTy = Tp;
+        break;
+      }
+    }
+
+    // Proceed if a load word is found.
+    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+      RegSz/SclrLoadTy.getSizeInBits());
+
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  RegSz/MemVT.getScalarType().getSizeInBits());
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    // Perform a single load.
+    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                  Ld->getBasePtr(),
+                                  Ld->getPointerInfo(), Ld->isVolatile(),
+                                  Ld->isNonTemporal(), Ld->getAlignment());
+
+    // Insert the word loaded into a vector.
+    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+      LoadUnitVecVT, ScalarLoad);
+
+    // Bitcast the loaded value to a vector of the original element type, in
+    // the size of the target vector type.
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+    unsigned SizeRatio = RegSz/MemSz;
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                DAG.getUNDEF(SlicedVec.getValueType()),
+                                ShuffleVec.data());
+
+    // Bitcast to the requested type.
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    // Replace the original load with the new sequence
+    // and return the new chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+    return SDValue(ScalarLoad.getNode(), 1);
+  }
+
+  return SDValue();
+}
+
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  DebugLoc dl = St->getDebugLoc();
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatination of two XMM registers, perform two stores.
+  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
+  // 128-bit ones. If in the future the cost becomes only one memory access the
+  // first version would be better.
+  if (VT.getSizeInBits() == 256 &&
+    StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+    StoredVal.getNumOperands() == 2) {
+
+    SDValue Value0 = StoredVal.getOperand(0);
+    SDValue Value1 = StoredVal.getOperand(1);
+
+    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+  }
+
+
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
 
   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  EVT VT = St->getValue().getValueType();
   if (VT.getSizeInBits() != 64)
     return SDValue();
 
   const Function *F = DAG.getMachineFunction().getFunction();
   bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
   bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
-    && Subtarget->hasSSE2();
+                     && Subtarget->hasXMMInt();
   if ((VT.isVector() ||
        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
       isa<LoadSDNode>(St->getValue()) &&
@@ -12172,6 +13802,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  EVT VT = LHS.getValueType();
+  unsigned N = VT.getVectorNumElements();
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 8> LMask(N);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      A = LHS.getOperand(0);
+    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      B = LHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+  } else {
+    if (LHS.getOpcode() != ISD::UNDEF)
+      A = LHS;
+    for (unsigned i = 0; i != N; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 8> RMask(N);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      C = RHS.getOperand(0);
+    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      D = RHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+  } else {
+    if (RHS.getOpcode() != ISD::UNDEF)
+      C = RHS;
+    for (unsigned i = 0; i != N; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    for (unsigned i = 0; i != N; ++i) {
+      unsigned Idx = RMask[i];
+      if (Idx < N)
+        RMask[i] += N;
+      else if (Idx < 2*N)
+        RMask[i] -= N;
+    }
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned i = 0; i != N; ++i) {
+    unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+    // Ignore any UNDEF components.
+    if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+        || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+      continue;
+
+    // Check that successive elements are being operated on.  If not, this is
+    // not a horizontal operation.
+    if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+        !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+      return false;
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, false))
+    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
 /// X86ISD::FXOR nodes.
 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -12326,7 +14100,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
 //      (add Y, (setne X, 0)) -> sbb -1, Y
 //      (sub (sete  X, 0), Y) -> sbb  0, Y
 //      (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   DebugLoc DL = N->getDebugLoc();
 
   // Look through ZExts.
@@ -12362,6 +14136,31 @@ static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
 }
 
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is a XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, VT));
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
+    }
+  }
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -12369,10 +14168,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::EXTRACT_VECTOR_ELT:
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+  case ISD::VSELECT:
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
-  case ISD::ADD:
-  case ISD::SUB:            return OptimizeConditonalInDecrement(N, DAG);
+  case ISD::ADD:            return OptimizeConditionalInDecrement(N, DAG);
+  case ISD::SUB:            return PerformSubCombine(N, DAG);
   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   case ISD::SHL:
@@ -12380,8 +14180,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
@@ -12398,14 +14201,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PUNPCKHQDQ:
   case X86ISD::UNPCKHPS:
   case X86ISD::UNPCKHPD:
+  case X86ISD::VUNPCKHPSY:
+  case X86ISD::VUNPCKHPDY:
   case X86ISD::PUNPCKLBW:
   case X86ISD::PUNPCKLWD:
   case X86ISD::PUNPCKLDQ:
   case X86ISD::PUNPCKLQDQ:
   case X86ISD::UNPCKLPS:
   case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPS:
-  case X86ISD::VUNPCKLPD:
   case X86ISD::VUNPCKLPSY:
   case X86ISD::VUNPCKLPDY:
   case X86ISD::MOVHLPS:
@@ -12415,7 +14218,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
+  case X86ISD::VPERM2F128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   }
 
   return SDValue();
@@ -12551,7 +14359,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
          AsmPieces[1] == "${0:q}")) {
       // No need to check constraints, nothing other than the equivalent of
       // "=r,0" would be valid here.
-      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
       if (!Ty || Ty->getBitWidth() % 16 != 0)
         return false;
       return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12572,7 +14380,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
           AsmPieces[1] == "~{dirflag}" &&
           AsmPieces[2] == "~{flags}" &&
           AsmPieces[3] == "~{fpsr}") {
-        const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
         if (!Ty || Ty->getBitWidth() % 16 != 0)
           return false;
         return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12603,7 +14411,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
                 AsmPieces[1] == "~{dirflag}" &&
                 AsmPieces[2] == "~{flags}" &&
                 AsmPieces[3] == "~{fpsr}") {
-              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
               if (!Ty || Ty->getBitWidth() % 16 != 0)
                 return false;
               return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12629,7 +14437,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
             SplitString(AsmPieces[2], Words, " \t,");
             if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
                 Words[2] == "%edx") {
-              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
               if (!Ty || Ty->getBitWidth() % 16 != 0)
                 return false;
               return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12700,7 +14508,7 @@ TargetLowering::ConstraintWeight
     // but allow it at the lowest weight.
   if (CallOperandVal == NULL)
     return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
   // Look at the constraint type.
   switch (*constraint) {
   default:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b6036782b8654..342a5e6175451 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -175,8 +175,14 @@ namespace llvm {
       /// PSIGNB/W/D - Copy integer sign.
       PSIGNB, PSIGNW, PSIGND,
 
-      /// PBLENDVB - Variable blend
-      PBLENDVB,
+      /// BLEND family of opcodes
+      BLENDV,
+
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
 
       /// FMAX, FMIN - Floating point max and min.
       ///
@@ -222,6 +228,8 @@ namespace llvm {
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
+      ANDN, // ANDN - Bitwise AND NOT with FLAGS results.
+
       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
 
       // MUL_IMM - X86 specific multiply by immediate.
@@ -257,12 +265,12 @@ namespace llvm {
       MOVSS,
       UNPCKLPS,
       UNPCKLPD,
-      VUNPCKLPS,
-      VUNPCKLPD,
       VUNPCKLPSY,
       VUNPCKLPDY,
       UNPCKHPS,
       UNPCKHPD,
+      VUNPCKHPSY,
+      VUNPCKHPDY,
       PUNPCKLBW,
       PUNPCKLWD,
       PUNPCKLDQ,
@@ -271,6 +279,12 @@ namespace llvm {
       PUNPCKHWD,
       PUNPCKHDQ,
       PUNPCKHQDQ,
+      VPERMILPS,
+      VPERMILPSY,
+      VPERMILPD,
+      VPERMILPDY,
+      VPERM2F128,
+      VBROADCAST,
 
       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded
@@ -280,6 +294,11 @@ namespace llvm {
       // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
       WIN_ALLOCA,
 
+      // SEG_ALLOCA - For allocating variable amounts of stack space when using
+      // segmented stacks. Check if the current stacklet has enough space, and
+      // falls back to heap allocation if not.
+      SEG_ALLOCA,
+
       // Memory barrier
       MEMBARRIER,
       MFENCE,
@@ -297,9 +316,10 @@ namespace llvm {
       ATOMNAND64_DAG,
       ATOMSWAP64_DAG,
 
-      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
       LCMPXCHG_DAG,
       LCMPXCHG8_DAG,
+      LCMPXCHG16_DAG,
 
       // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
       VZEXT_LOAD,
@@ -407,20 +427,16 @@ namespace llvm {
 
     /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
     /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-    bool isMOVSHDUPMask(ShuffleVectorSDNode *N);
+    bool isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
 
     /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
     /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-    bool isMOVSLDUPMask(ShuffleVectorSDNode *N);
+    bool isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
 
     /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
     /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
     bool isMOVDDUPMask(ShuffleVectorSDNode *N);
 
-    /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
-    /// specifies a shuffle of elements that is suitable for input to PALIGNR.
-    bool isPALIGNRMask(ShuffleVectorSDNode *N);
-
     /// isVEXTRACTF128Index - Return true if the specified
     /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
     /// suitable for input to VEXTRACTF128.
@@ -505,7 +521,7 @@ namespace llvm {
     /// function arguments in the caller parameter area. For X86, aggregates
     /// that contains are placed at 16-byte boundaries while the rest are at
     /// 4-byte boundaries.
-    virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+    virtual unsigned getByValTypeAlignment(Type *Ty) const;
 
     /// getOptimalMemOpType - Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
@@ -564,8 +580,8 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    virtual EVT getSetCCResultType(EVT VT) const;
 
     /// computeMaskedBitsForTargetNode - Determine which of the bits specified
     /// in Mask are known to be either zero or one and return them in the
@@ -617,12 +633,12 @@ namespace llvm {
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
 
     /// isTruncateFree - Return true if it's free to truncate a value of
     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
     /// register EAX to i16 by referencing its sub-register AX.
-    virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
     virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
 
     /// isZExtFree - Return true if any actual instruction that defines a
@@ -633,7 +649,7 @@ namespace llvm {
     /// does not necessarily apply to truncate instructions. e.g. on x86-64,
     /// all instructions that define 32-bit values implicit zero-extend the
     /// result out to 64 bits.
-    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
     virtual bool isZExtFree(EVT VT1, EVT VT2) const;
 
     /// isNarrowingProfitable - Return true if it's profitable to narrow
@@ -813,11 +829,14 @@ namespace llvm {
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
@@ -825,6 +844,7 @@ namespace llvm {
     SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
     // Utility functions to help LowerVECTOR_SHUFFLE
@@ -931,6 +951,10 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
                                               MachineBasicBlock *BB) const;
 
+    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool Is64Bit) const;
+
     MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
                                           MachineBasicBlock *BB) const;
 
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 9f7a4b06dc6fe..74b647a4f6b19 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -650,6 +650,15 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   let isCodeGenOnly = 1;
 }
 
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", []> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+}
+
 // BinOpRM - Instructions like "add reg, reg, [mem]".
 class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               dag outlist, list<dag> pattern>
@@ -857,11 +866,10 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpAI - Instructions like "add %eax, %eax, imm".
 class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              Register areg>
+              Register areg, string operands>
   : ITy<opcode, RawFrm, typeinfo,
         (outs), (ins typeinfo.ImmOperand:$src),
-        mnemonic, !strconcat("{$src, %", areg.AsmName, "|%",
-                               areg.AsmName, ", $src}"), []> {
+        mnemonic, operands, []> {
   let ImmT = typeinfo.ImmEncoding;
   let Uses = [areg];
   let Defs = [areg];
@@ -926,10 +934,14 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def #NAME#32mi   : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
     def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
 
-    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
-    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
-    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
-    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                              "{$src, %al|AL, $src}">;
+    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                              "{$src, %ax|AX, $src}">;
+    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                              "{$src, %eax|EAX, $src}">;
+    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                              "{$src, %rax|RAX, $src}">;
   }                          
 }
 
@@ -993,10 +1005,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def #NAME#32mi   : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
     def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
 
-    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
-    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
-    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
-    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                              "{$src, %al|AL, $src}">;
+    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                              "{$src, %ax|AX, $src}">;
+    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                              "{$src, %eax|EAX, $src}">;
+    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, 
+                              "{$src, %rax|RAX, $src}">;
   }                          
 }
 
@@ -1017,10 +1033,10 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
       def #NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
     } // isCommutable
 
-    def #NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
-    def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
-    def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
-    def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+    def #NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+    def #NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+    def #NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+    def #NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
 
     def #NAME#8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
     def #NAME#16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
@@ -1056,10 +1072,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def #NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
     def #NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
 
-    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
-    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
-    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
-    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+    def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                              "{$src, %al|AL, $src}">;
+    def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                              "{$src, %ax|AX, $src}">;
+    def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                              "{$src, %eax|EAX, $src}">;
+    def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                              "{$src, %rax|RAX, $src}">;
   }                          
 }
 
@@ -1117,9 +1137,37 @@ let Defs = [EFLAGS] in {
   def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
   def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
                      
-  def TEST8i8    : BinOpAI<0xA8, "test", Xi8 , AL>;
-  def TEST16i16  : BinOpAI<0xA8, "test", Xi16, AX>;
-  def TEST32i32  : BinOpAI<0xA8, "test", Xi32, EAX>;
-  def TEST64i32  : BinOpAI<0xA8, "test", Xi64, RAX>;
-}                          
+  def TEST8i8    : BinOpAI<0xA8, "test", Xi8 , AL,
+                           "{$src, %al|AL, $src}">;
+  def TEST16i16  : BinOpAI<0xA8, "test", Xi16, AX,
+                           "{$src, %ax|AX, $src}">;
+  def TEST32i32  : BinOpAI<0xA8, "test", Xi32, EAX,
+                           "{$src, %eax|EAX, $src}">;
+  def TEST64i32  : BinOpAI<0xA8, "test", Xi64, RAX,
+                           "{$src, %rax|RAX, $src}">;
+
+  // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+  // register class is constrained to GR8_NOREX.
+  let isPseudo = 1 in
+  def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+                        "", []>;
+}
 
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+                    PatFrag ld_frag> {
+  def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))]>;
+  def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS,
+             (X86andn_flag RC:$src1, (ld_frag addr:$src2)))]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V;
+  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
+}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index adcc747eb4b88..da28690672a66 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -106,6 +106,26 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
   def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
                      "# dynamic stack allocation",
                      [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from 
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP, EAX] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[In32BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP, RAX] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+
 }
 
 
@@ -329,18 +349,11 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions
 
-let Constraints = "$src1 = $dst" in {
-
-// Conditional moves
-let Uses = [EFLAGS] in {
-
 // X86 doesn't have 8-bit conditional moves. Use a customInserter to
 // emit control flow. An alternative to this is to mark i8 SELECT as Promote,
 // however that requires promoting the operands, and can induce additional
-// i8 register pressure. Note that CMOV_GR8 is conservatively considered to
-// clobber EFLAGS, because if one of the operands is zero, the expansion
-// could involve an xor.
-let usesCustomInserter = 1, Constraints = "", Defs = [EFLAGS] in {
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
 def CMOV_GR8 : I<0, Pseudo,
                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
                  "#CMOV_GR8 PSEUDO!",
@@ -380,10 +393,7 @@ def CMOV_RFP80 : I<0, Pseudo,
                       (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
                                                   EFLAGS))]>;
 } // Predicates = [NoCMov]
-} // UsesCustomInserter = 1, Constraints = "", Defs = [EFLAGS]
-} // Uses = [EFLAGS]
-
-} // Constraints = "$src1 = $dst" in
+} // UsesCustomInserter = 1, Uses = [EFLAGS]
 
 
 //===----------------------------------------------------------------------===//
@@ -532,7 +542,7 @@ def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
 let hasSideEffects = 1 in
 def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
                      "#MEMBARRIER",
-                     [(X86MemBarrier)]>, Requires<[HasSSE2]>;
+                     [(X86MemBarrier)]>;
 
 // TODO: Get this to fold the constant into the instruction.
 let hasSideEffects = 1, Defs = [ESP], isCodeGenOnly = 1 in
@@ -630,8 +640,8 @@ def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
 defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
 defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
 defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
-defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">;
-defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
 
 // Optimized codegen when the non-memory output is not used.
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
@@ -665,12 +675,20 @@ def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
 
 // Atomic compare and swap.
 let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
-    isCodeGenOnly = 1 in {
+    isCodeGenOnly = 1 in
 def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
                "lock\n\t"
                "cmpxchg8b\t$ptr",
                [(X86cas8 addr:$ptr)]>, TB, LOCK;
-}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+    isCodeGenOnly = 1 in
+def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
+                    "lock\n\t"
+                    "cmpxchg16b\t$ptr",
+                    [(X86cas16 addr:$ptr)]>, TB, LOCK,
+                    Requires<[HasCmpxchg16b]>;
+
 let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
 def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
                "lock\n\t"
@@ -695,7 +713,7 @@ def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
 let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in {
 def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
                "lock\n\t"
-               "cmpxchgq\t$swap,$ptr",
+               "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}",
                [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
 }
 
@@ -718,11 +736,37 @@ def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
                 TB, LOCK;
 def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
                "lock\n\t"
-               "xadd\t$val, $ptr",
+               "xadd{q}\t{$val, $ptr|$ptr, $val}",
                [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
                 TB, LOCK;
 }
 
+def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_8  addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions.
 //===----------------------------------------------------------------------===//
@@ -759,6 +803,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
                     [(set VR128:$dst,
                       (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                           EFLAGS)))]>;
+  def CMOV_V8F32 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V8F32 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4F64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4F64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4I64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4I64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
 }
 
 
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 2e1d523ea16de..e62e6b701f463 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -76,12 +76,12 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
 // except that they use GR32_NOREX for the output operand register class
 // instead of GR32. This allows them to operate on h registers on x86-64.
 def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
-                         (outs GR32_NOREX:$dst), (ins GR8:$src),
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
                          "movz{bl|x}\t{$src, $dst|$dst, $src}",
                          []>, TB;
 let mayLoad = 1 in
 def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
-                         (outs GR32_NOREX:$dst), (ins i8mem:$src),
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
                          "movz{bl|x}\t{$src, $dst|$dst, $src}",
                          []>, TB;
 
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 6d89bcc29e7b0..0a1590b3e0a2b 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -113,6 +113,7 @@ class VEX_W  { bit hasVEX_WPrefix = 1; }
 class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
 class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
 class VEX_L  { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
 class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
 
 class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
@@ -150,6 +151,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   bit hasVEX_i8ImmReg = 0;  // Does this inst require the last source register
                             // to be encoded in a immediate field?
   bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
   bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
 
   // TSFlags layout should be kept in sync with X86InstrInfo.h.
@@ -169,7 +171,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   let TSFlags{35}    = hasVEX_4VPrefix;
   let TSFlags{36}    = hasVEX_i8ImmReg;
   let TSFlags{37}    = hasVEX_L;
-  let TSFlags{38}    = has3DNow0F0FOpcode;
+  let TSFlags{38}    = ignoresVEX_L;
+  let TSFlags{39}    = has3DNow0F0FOpcode;
 }
 
 class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -501,6 +504,9 @@ class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern>
       : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
 
 // MMX Instruction templates
 //
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c9fa4de..af919fba8ee4c 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -39,6 +39,8 @@ def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
 def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
+def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
 def X86cmpss   : SDNode<"X86ISD::FSETCCss",    SDTX86Cmpss>;
@@ -49,18 +51,15 @@ def X86pshufb  : SDNode<"X86ISD::PSHUFB",
 def X86andnp   : SDNode<"X86ISD::ANDNP",
                  SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
-def X86psignb  : SDNode<"X86ISD::PSIGNB", 
+def X86psignb  : SDNode<"X86ISD::PSIGNB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
-def X86psignw  : SDNode<"X86ISD::PSIGNW", 
+def X86psignw  : SDNode<"X86ISD::PSIGNW",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
-def X86psignd  : SDNode<"X86ISD::PSIGND", 
+def X86psignd  : SDNode<"X86ISD::PSIGND",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
-def X86pblendv : SDNode<"X86ISD::PBLENDVB", 
-                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
-                                      SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>>;
 def X86pextrb  : SDNode<"X86ISD::PEXTRB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
 def X86pextrw  : SDNode<"X86ISD::PEXTRW",
@@ -109,6 +108,8 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                  SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+
 def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
 
 def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -133,12 +134,15 @@ def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>;
 def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
 def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
 
-def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
-def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
+def X86Unpcklps  : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
+def X86Unpcklpd  : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
 def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>;
 def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>;
-def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
-def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
+
+def X86Unpckhps  : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
+def X86Unpckhpd  : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
+def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>;
+def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>;
 
 def X86Punpcklbw  : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>;
 def X86Punpcklwd  : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>;
@@ -150,6 +154,15 @@ def X86Punpckhwd  : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
 def X86Punpckhdq  : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
 
+def X86VPermilps  : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
+def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>;
+def X86VPermilpd  : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;
+def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>;
+
+def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
@@ -193,17 +206,28 @@ def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
 def loadv8i32    : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
 def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
-// Like 'store', but always requires vector alignment.
+// Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 16;
 }]>;
 
-// Like 'load', but always requires vector alignment.
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
 def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
 def alignedloadfsf32 : PatFrag<(ops node:$ptr),
                                (f32 (alignedload node:$ptr))>;
 def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -221,13 +245,13 @@ def alignedloadv2i64 : PatFrag<(ops node:$ptr),
 
 // 256-bit aligned load pattern fragments
 def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload node:$ptr))>;
+                               (v8f32 (alignedload256 node:$ptr))>;
 def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload node:$ptr))>;
+                               (v4f64 (alignedload256 node:$ptr))>;
 def alignedloadv8i32 : PatFrag<(ops node:$ptr),
-                               (v8i32 (alignedload node:$ptr))>;
+                               (v8i32 (alignedload256 node:$ptr))>;
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload node:$ptr))>;
+                               (v4i64 (alignedload256 node:$ptr))>;
 
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
@@ -356,7 +380,7 @@ def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
   return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
 }]>;
 
-// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to 
+// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
 // VINSERTF128 imm.
 def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
   return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
@@ -398,16 +422,6 @@ def movl : PatFrag<(ops node:$lhs, node:$rhs),
   return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
 }]>;
 
-def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
 def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
                      (vector_shuffle node:$lhs, node:$rhs), [{
   return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
@@ -418,16 +432,6 @@ def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
   return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
 }]>;
 
-def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                           (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                           (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
 def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
                      (vector_shuffle node:$lhs, node:$rhs), [{
   return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
@@ -448,11 +452,6 @@ def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
   return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
 }], SHUFFLE_get_pshuflw_imm>;
 
-def palign : PatFrag<(ops node:$lhs, node:$rhs),
-                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_palign_imm>;
-
 def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
                                    (extract_subvector node:$bigvec,
                                                       node:$index), [{
@@ -465,3 +464,4 @@ def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                                    node:$index), [{
   return X86::isVINSERTF128Index(N);
 }], INSERT_get_vinsertf128_imm>;
+
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 55b5835f52a7f..3a02de0aa01b9 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -53,6 +53,36 @@ ReMatPICStubLoad("remat-pic-stub-load",
                  cl::desc("Re-materialize load from stub in PIC mode"),
                  cl::init(false), cl::Hidden);
 
+enum {
+  // Select which memory operand is being unfolded.
+  // (stored in bits 0 - 7)
+  TB_INDEX_0    = 0,
+  TB_INDEX_1    = 1,
+  TB_INDEX_2    = 2,
+  TB_INDEX_MASK = 0xff,
+
+  // Minimum alignment required for load/store.
+  // Used for RegOp->MemOp conversion.
+  // (stored in bits 8 - 15)
+  TB_ALIGN_SHIFT = 8,
+  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
+  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
+  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT,
+
+  // Do not insert the reverse map (MemOp -> RegOp) into the table.
+  // This may be needed because there is a many -> one mapping.
+  TB_NO_REVERSE   = 1 << 16,
+
+  // Do not insert the forward map (RegOp -> MemOp) into the table.
+  // This is needed for Native Client, which prohibits branch
+  // instructions from using a memory operand.
+  TB_NO_FORWARD   = 1 << 17,
+
+  TB_FOLDED_LOAD  = 1 << 18,
+  TB_FOLDED_STORE = 1 << 19
+};
+
 X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
   : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
                      ? X86::ADJCALLSTACKDOWN64
@@ -61,655 +91,829 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
                      ? X86::ADJCALLSTACKUP64
                      : X86::ADJCALLSTACKUP32)),
     TM(tm), RI(tm, *this) {
-  enum {
-    TB_NOT_REVERSABLE = 1U << 31,
-    TB_FLAGS = TB_NOT_REVERSABLE
-  };
 
-  static const unsigned OpTbl2Addr[][2] = {
-    { X86::ADC32ri,     X86::ADC32mi },
-    { X86::ADC32ri8,    X86::ADC32mi8 },
-    { X86::ADC32rr,     X86::ADC32mr },
-    { X86::ADC64ri32,   X86::ADC64mi32 },
-    { X86::ADC64ri8,    X86::ADC64mi8 },
-    { X86::ADC64rr,     X86::ADC64mr },
-    { X86::ADD16ri,     X86::ADD16mi },
-    { X86::ADD16ri8,    X86::ADD16mi8 },
-    { X86::ADD16ri_DB,  X86::ADD16mi  | TB_NOT_REVERSABLE },
-    { X86::ADD16ri8_DB, X86::ADD16mi8 | TB_NOT_REVERSABLE },
-    { X86::ADD16rr,     X86::ADD16mr },
-    { X86::ADD16rr_DB,  X86::ADD16mr | TB_NOT_REVERSABLE },
-    { X86::ADD32ri,     X86::ADD32mi },
-    { X86::ADD32ri8,    X86::ADD32mi8 },
-    { X86::ADD32ri_DB,  X86::ADD32mi | TB_NOT_REVERSABLE },
-    { X86::ADD32ri8_DB, X86::ADD32mi8 | TB_NOT_REVERSABLE },
-    { X86::ADD32rr,     X86::ADD32mr },
-    { X86::ADD32rr_DB,  X86::ADD32mr | TB_NOT_REVERSABLE },
-    { X86::ADD64ri32,   X86::ADD64mi32 },
-    { X86::ADD64ri8,    X86::ADD64mi8 },
-    { X86::ADD64ri32_DB,X86::ADD64mi32 | TB_NOT_REVERSABLE },
-    { X86::ADD64ri8_DB, X86::ADD64mi8 | TB_NOT_REVERSABLE },
-    { X86::ADD64rr,     X86::ADD64mr },
-    { X86::ADD64rr_DB,  X86::ADD64mr | TB_NOT_REVERSABLE },
-    { X86::ADD8ri,      X86::ADD8mi },
-    { X86::ADD8rr,      X86::ADD8mr },
-    { X86::AND16ri,     X86::AND16mi },
-    { X86::AND16ri8,    X86::AND16mi8 },
-    { X86::AND16rr,     X86::AND16mr },
-    { X86::AND32ri,     X86::AND32mi },
-    { X86::AND32ri8,    X86::AND32mi8 },
-    { X86::AND32rr,     X86::AND32mr },
-    { X86::AND64ri32,   X86::AND64mi32 },
-    { X86::AND64ri8,    X86::AND64mi8 },
-    { X86::AND64rr,     X86::AND64mr },
-    { X86::AND8ri,      X86::AND8mi },
-    { X86::AND8rr,      X86::AND8mr },
-    { X86::DEC16r,      X86::DEC16m },
-    { X86::DEC32r,      X86::DEC32m },
-    { X86::DEC64_16r,   X86::DEC64_16m },
-    { X86::DEC64_32r,   X86::DEC64_32m },
-    { X86::DEC64r,      X86::DEC64m },
-    { X86::DEC8r,       X86::DEC8m },
-    { X86::INC16r,      X86::INC16m },
-    { X86::INC32r,      X86::INC32m },
-    { X86::INC64_16r,   X86::INC64_16m },
-    { X86::INC64_32r,   X86::INC64_32m },
-    { X86::INC64r,      X86::INC64m },
-    { X86::INC8r,       X86::INC8m },
-    { X86::NEG16r,      X86::NEG16m },
-    { X86::NEG32r,      X86::NEG32m },
-    { X86::NEG64r,      X86::NEG64m },
-    { X86::NEG8r,       X86::NEG8m },
-    { X86::NOT16r,      X86::NOT16m },
-    { X86::NOT32r,      X86::NOT32m },
-    { X86::NOT64r,      X86::NOT64m },
-    { X86::NOT8r,       X86::NOT8m },
-    { X86::OR16ri,      X86::OR16mi },
-    { X86::OR16ri8,     X86::OR16mi8 },
-    { X86::OR16rr,      X86::OR16mr },
-    { X86::OR32ri,      X86::OR32mi },
-    { X86::OR32ri8,     X86::OR32mi8 },
-    { X86::OR32rr,      X86::OR32mr },
-    { X86::OR64ri32,    X86::OR64mi32 },
-    { X86::OR64ri8,     X86::OR64mi8 },
-    { X86::OR64rr,      X86::OR64mr },
-    { X86::OR8ri,       X86::OR8mi },
-    { X86::OR8rr,       X86::OR8mr },
-    { X86::ROL16r1,     X86::ROL16m1 },
-    { X86::ROL16rCL,    X86::ROL16mCL },
-    { X86::ROL16ri,     X86::ROL16mi },
-    { X86::ROL32r1,     X86::ROL32m1 },
-    { X86::ROL32rCL,    X86::ROL32mCL },
-    { X86::ROL32ri,     X86::ROL32mi },
-    { X86::ROL64r1,     X86::ROL64m1 },
-    { X86::ROL64rCL,    X86::ROL64mCL },
-    { X86::ROL64ri,     X86::ROL64mi },
-    { X86::ROL8r1,      X86::ROL8m1 },
-    { X86::ROL8rCL,     X86::ROL8mCL },
-    { X86::ROL8ri,      X86::ROL8mi },
-    { X86::ROR16r1,     X86::ROR16m1 },
-    { X86::ROR16rCL,    X86::ROR16mCL },
-    { X86::ROR16ri,     X86::ROR16mi },
-    { X86::ROR32r1,     X86::ROR32m1 },
-    { X86::ROR32rCL,    X86::ROR32mCL },
-    { X86::ROR32ri,     X86::ROR32mi },
-    { X86::ROR64r1,     X86::ROR64m1 },
-    { X86::ROR64rCL,    X86::ROR64mCL },
-    { X86::ROR64ri,     X86::ROR64mi },
-    { X86::ROR8r1,      X86::ROR8m1 },
-    { X86::ROR8rCL,     X86::ROR8mCL },
-    { X86::ROR8ri,      X86::ROR8mi },
-    { X86::SAR16r1,     X86::SAR16m1 },
-    { X86::SAR16rCL,    X86::SAR16mCL },
-    { X86::SAR16ri,     X86::SAR16mi },
-    { X86::SAR32r1,     X86::SAR32m1 },
-    { X86::SAR32rCL,    X86::SAR32mCL },
-    { X86::SAR32ri,     X86::SAR32mi },
-    { X86::SAR64r1,     X86::SAR64m1 },
-    { X86::SAR64rCL,    X86::SAR64mCL },
-    { X86::SAR64ri,     X86::SAR64mi },
-    { X86::SAR8r1,      X86::SAR8m1 },
-    { X86::SAR8rCL,     X86::SAR8mCL },
-    { X86::SAR8ri,      X86::SAR8mi },
-    { X86::SBB32ri,     X86::SBB32mi },
-    { X86::SBB32ri8,    X86::SBB32mi8 },
-    { X86::SBB32rr,     X86::SBB32mr },
-    { X86::SBB64ri32,   X86::SBB64mi32 },
-    { X86::SBB64ri8,    X86::SBB64mi8 },
-    { X86::SBB64rr,     X86::SBB64mr },
-    { X86::SHL16rCL,    X86::SHL16mCL },
-    { X86::SHL16ri,     X86::SHL16mi },
-    { X86::SHL32rCL,    X86::SHL32mCL },
-    { X86::SHL32ri,     X86::SHL32mi },
-    { X86::SHL64rCL,    X86::SHL64mCL },
-    { X86::SHL64ri,     X86::SHL64mi },
-    { X86::SHL8rCL,     X86::SHL8mCL },
-    { X86::SHL8ri,      X86::SHL8mi },
-    { X86::SHLD16rrCL,  X86::SHLD16mrCL },
-    { X86::SHLD16rri8,  X86::SHLD16mri8 },
-    { X86::SHLD32rrCL,  X86::SHLD32mrCL },
-    { X86::SHLD32rri8,  X86::SHLD32mri8 },
-    { X86::SHLD64rrCL,  X86::SHLD64mrCL },
-    { X86::SHLD64rri8,  X86::SHLD64mri8 },
-    { X86::SHR16r1,     X86::SHR16m1 },
-    { X86::SHR16rCL,    X86::SHR16mCL },
-    { X86::SHR16ri,     X86::SHR16mi },
-    { X86::SHR32r1,     X86::SHR32m1 },
-    { X86::SHR32rCL,    X86::SHR32mCL },
-    { X86::SHR32ri,     X86::SHR32mi },
-    { X86::SHR64r1,     X86::SHR64m1 },
-    { X86::SHR64rCL,    X86::SHR64mCL },
-    { X86::SHR64ri,     X86::SHR64mi },
-    { X86::SHR8r1,      X86::SHR8m1 },
-    { X86::SHR8rCL,     X86::SHR8mCL },
-    { X86::SHR8ri,      X86::SHR8mi },
-    { X86::SHRD16rrCL,  X86::SHRD16mrCL },
-    { X86::SHRD16rri8,  X86::SHRD16mri8 },
-    { X86::SHRD32rrCL,  X86::SHRD32mrCL },
-    { X86::SHRD32rri8,  X86::SHRD32mri8 },
-    { X86::SHRD64rrCL,  X86::SHRD64mrCL },
-    { X86::SHRD64rri8,  X86::SHRD64mri8 },
-    { X86::SUB16ri,     X86::SUB16mi },
-    { X86::SUB16ri8,    X86::SUB16mi8 },
-    { X86::SUB16rr,     X86::SUB16mr },
-    { X86::SUB32ri,     X86::SUB32mi },
-    { X86::SUB32ri8,    X86::SUB32mi8 },
-    { X86::SUB32rr,     X86::SUB32mr },
-    { X86::SUB64ri32,   X86::SUB64mi32 },
-    { X86::SUB64ri8,    X86::SUB64mi8 },
-    { X86::SUB64rr,     X86::SUB64mr },
-    { X86::SUB8ri,      X86::SUB8mi },
-    { X86::SUB8rr,      X86::SUB8mr },
-    { X86::XOR16ri,     X86::XOR16mi },
-    { X86::XOR16ri8,    X86::XOR16mi8 },
-    { X86::XOR16rr,     X86::XOR16mr },
-    { X86::XOR32ri,     X86::XOR32mi },
-    { X86::XOR32ri8,    X86::XOR32mi8 },
-    { X86::XOR32rr,     X86::XOR32mr },
-    { X86::XOR64ri32,   X86::XOR64mi32 },
-    { X86::XOR64ri8,    X86::XOR64mi8 },
-    { X86::XOR64rr,     X86::XOR64mr },
-    { X86::XOR8ri,      X86::XOR8mi },
-    { X86::XOR8rr,      X86::XOR8mr }
+  static const unsigned OpTbl2Addr[][3] = {
+    { X86::ADC32ri,     X86::ADC32mi,    0 },
+    { X86::ADC32ri8,    X86::ADC32mi8,   0 },
+    { X86::ADC32rr,     X86::ADC32mr,    0 },
+    { X86::ADC64ri32,   X86::ADC64mi32,  0 },
+    { X86::ADC64ri8,    X86::ADC64mi8,   0 },
+    { X86::ADC64rr,     X86::ADC64mr,    0 },
+    { X86::ADD16ri,     X86::ADD16mi,    0 },
+    { X86::ADD16ri8,    X86::ADD16mi8,   0 },
+    { X86::ADD16ri_DB,  X86::ADD16mi,    TB_NO_REVERSE },
+    { X86::ADD16ri8_DB, X86::ADD16mi8,   TB_NO_REVERSE },
+    { X86::ADD16rr,     X86::ADD16mr,    0 },
+    { X86::ADD16rr_DB,  X86::ADD16mr,    TB_NO_REVERSE },
+    { X86::ADD32ri,     X86::ADD32mi,    0 },
+    { X86::ADD32ri8,    X86::ADD32mi8,   0 },
+    { X86::ADD32ri_DB,  X86::ADD32mi,    TB_NO_REVERSE },
+    { X86::ADD32ri8_DB, X86::ADD32mi8,   TB_NO_REVERSE },
+    { X86::ADD32rr,     X86::ADD32mr,    0 },
+    { X86::ADD32rr_DB,  X86::ADD32mr,    TB_NO_REVERSE },
+    { X86::ADD64ri32,   X86::ADD64mi32,  0 },
+    { X86::ADD64ri8,    X86::ADD64mi8,   0 },
+    { X86::ADD64ri32_DB,X86::ADD64mi32,  TB_NO_REVERSE },
+    { X86::ADD64ri8_DB, X86::ADD64mi8,   TB_NO_REVERSE },
+    { X86::ADD64rr,     X86::ADD64mr,    0 },
+    { X86::ADD64rr_DB,  X86::ADD64mr,    TB_NO_REVERSE },
+    { X86::ADD8ri,      X86::ADD8mi,     0 },
+    { X86::ADD8rr,      X86::ADD8mr,     0 },
+    { X86::AND16ri,     X86::AND16mi,    0 },
+    { X86::AND16ri8,    X86::AND16mi8,   0 },
+    { X86::AND16rr,     X86::AND16mr,    0 },
+    { X86::AND32ri,     X86::AND32mi,    0 },
+    { X86::AND32ri8,    X86::AND32mi8,   0 },
+    { X86::AND32rr,     X86::AND32mr,    0 },
+    { X86::AND64ri32,   X86::AND64mi32,  0 },
+    { X86::AND64ri8,    X86::AND64mi8,   0 },
+    { X86::AND64rr,     X86::AND64mr,    0 },
+    { X86::AND8ri,      X86::AND8mi,     0 },
+    { X86::AND8rr,      X86::AND8mr,     0 },
+    { X86::DEC16r,      X86::DEC16m,     0 },
+    { X86::DEC32r,      X86::DEC32m,     0 },
+    { X86::DEC64_16r,   X86::DEC64_16m,  0 },
+    { X86::DEC64_32r,   X86::DEC64_32m,  0 },
+    { X86::DEC64r,      X86::DEC64m,     0 },
+    { X86::DEC8r,       X86::DEC8m,      0 },
+    { X86::INC16r,      X86::INC16m,     0 },
+    { X86::INC32r,      X86::INC32m,     0 },
+    { X86::INC64_16r,   X86::INC64_16m,  0 },
+    { X86::INC64_32r,   X86::INC64_32m,  0 },
+    { X86::INC64r,      X86::INC64m,     0 },
+    { X86::INC8r,       X86::INC8m,      0 },
+    { X86::NEG16r,      X86::NEG16m,     0 },
+    { X86::NEG32r,      X86::NEG32m,     0 },
+    { X86::NEG64r,      X86::NEG64m,     0 },
+    { X86::NEG8r,       X86::NEG8m,      0 },
+    { X86::NOT16r,      X86::NOT16m,     0 },
+    { X86::NOT32r,      X86::NOT32m,     0 },
+    { X86::NOT64r,      X86::NOT64m,     0 },
+    { X86::NOT8r,       X86::NOT8m,      0 },
+    { X86::OR16ri,      X86::OR16mi,     0 },
+    { X86::OR16ri8,     X86::OR16mi8,    0 },
+    { X86::OR16rr,      X86::OR16mr,     0 },
+    { X86::OR32ri,      X86::OR32mi,     0 },
+    { X86::OR32ri8,     X86::OR32mi8,    0 },
+    { X86::OR32rr,      X86::OR32mr,     0 },
+    { X86::OR64ri32,    X86::OR64mi32,   0 },
+    { X86::OR64ri8,     X86::OR64mi8,    0 },
+    { X86::OR64rr,      X86::OR64mr,     0 },
+    { X86::OR8ri,       X86::OR8mi,      0 },
+    { X86::OR8rr,       X86::OR8mr,      0 },
+    { X86::ROL16r1,     X86::ROL16m1,    0 },
+    { X86::ROL16rCL,    X86::ROL16mCL,   0 },
+    { X86::ROL16ri,     X86::ROL16mi,    0 },
+    { X86::ROL32r1,     X86::ROL32m1,    0 },
+    { X86::ROL32rCL,    X86::ROL32mCL,   0 },
+    { X86::ROL32ri,     X86::ROL32mi,    0 },
+    { X86::ROL64r1,     X86::ROL64m1,    0 },
+    { X86::ROL64rCL,    X86::ROL64mCL,   0 },
+    { X86::ROL64ri,     X86::ROL64mi,    0 },
+    { X86::ROL8r1,      X86::ROL8m1,     0 },
+    { X86::ROL8rCL,     X86::ROL8mCL,    0 },
+    { X86::ROL8ri,      X86::ROL8mi,     0 },
+    { X86::ROR16r1,     X86::ROR16m1,    0 },
+    { X86::ROR16rCL,    X86::ROR16mCL,   0 },
+    { X86::ROR16ri,     X86::ROR16mi,    0 },
+    { X86::ROR32r1,     X86::ROR32m1,    0 },
+    { X86::ROR32rCL,    X86::ROR32mCL,   0 },
+    { X86::ROR32ri,     X86::ROR32mi,    0 },
+    { X86::ROR64r1,     X86::ROR64m1,    0 },
+    { X86::ROR64rCL,    X86::ROR64mCL,   0 },
+    { X86::ROR64ri,     X86::ROR64mi,    0 },
+    { X86::ROR8r1,      X86::ROR8m1,     0 },
+    { X86::ROR8rCL,     X86::ROR8mCL,    0 },
+    { X86::ROR8ri,      X86::ROR8mi,     0 },
+    { X86::SAR16r1,     X86::SAR16m1,    0 },
+    { X86::SAR16rCL,    X86::SAR16mCL,   0 },
+    { X86::SAR16ri,     X86::SAR16mi,    0 },
+    { X86::SAR32r1,     X86::SAR32m1,    0 },
+    { X86::SAR32rCL,    X86::SAR32mCL,   0 },
+    { X86::SAR32ri,     X86::SAR32mi,    0 },
+    { X86::SAR64r1,     X86::SAR64m1,    0 },
+    { X86::SAR64rCL,    X86::SAR64mCL,   0 },
+    { X86::SAR64ri,     X86::SAR64mi,    0 },
+    { X86::SAR8r1,      X86::SAR8m1,     0 },
+    { X86::SAR8rCL,     X86::SAR8mCL,    0 },
+    { X86::SAR8ri,      X86::SAR8mi,     0 },
+    { X86::SBB32ri,     X86::SBB32mi,    0 },
+    { X86::SBB32ri8,    X86::SBB32mi8,   0 },
+    { X86::SBB32rr,     X86::SBB32mr,    0 },
+    { X86::SBB64ri32,   X86::SBB64mi32,  0 },
+    { X86::SBB64ri8,    X86::SBB64mi8,   0 },
+    { X86::SBB64rr,     X86::SBB64mr,    0 },
+    { X86::SHL16rCL,    X86::SHL16mCL,   0 },
+    { X86::SHL16ri,     X86::SHL16mi,    0 },
+    { X86::SHL32rCL,    X86::SHL32mCL,   0 },
+    { X86::SHL32ri,     X86::SHL32mi,    0 },
+    { X86::SHL64rCL,    X86::SHL64mCL,   0 },
+    { X86::SHL64ri,     X86::SHL64mi,    0 },
+    { X86::SHL8rCL,     X86::SHL8mCL,    0 },
+    { X86::SHL8ri,      X86::SHL8mi,     0 },
+    { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
+    { X86::SHLD16rri8,  X86::SHLD16mri8, 0 },
+    { X86::SHLD32rrCL,  X86::SHLD32mrCL, 0 },
+    { X86::SHLD32rri8,  X86::SHLD32mri8, 0 },
+    { X86::SHLD64rrCL,  X86::SHLD64mrCL, 0 },
+    { X86::SHLD64rri8,  X86::SHLD64mri8, 0 },
+    { X86::SHR16r1,     X86::SHR16m1,    0 },
+    { X86::SHR16rCL,    X86::SHR16mCL,   0 },
+    { X86::SHR16ri,     X86::SHR16mi,    0 },
+    { X86::SHR32r1,     X86::SHR32m1,    0 },
+    { X86::SHR32rCL,    X86::SHR32mCL,   0 },
+    { X86::SHR32ri,     X86::SHR32mi,    0 },
+    { X86::SHR64r1,     X86::SHR64m1,    0 },
+    { X86::SHR64rCL,    X86::SHR64mCL,   0 },
+    { X86::SHR64ri,     X86::SHR64mi,    0 },
+    { X86::SHR8r1,      X86::SHR8m1,     0 },
+    { X86::SHR8rCL,     X86::SHR8mCL,    0 },
+    { X86::SHR8ri,      X86::SHR8mi,     0 },
+    { X86::SHRD16rrCL,  X86::SHRD16mrCL, 0 },
+    { X86::SHRD16rri8,  X86::SHRD16mri8, 0 },
+    { X86::SHRD32rrCL,  X86::SHRD32mrCL, 0 },
+    { X86::SHRD32rri8,  X86::SHRD32mri8, 0 },
+    { X86::SHRD64rrCL,  X86::SHRD64mrCL, 0 },
+    { X86::SHRD64rri8,  X86::SHRD64mri8, 0 },
+    { X86::SUB16ri,     X86::SUB16mi,    0 },
+    { X86::SUB16ri8,    X86::SUB16mi8,   0 },
+    { X86::SUB16rr,     X86::SUB16mr,    0 },
+    { X86::SUB32ri,     X86::SUB32mi,    0 },
+    { X86::SUB32ri8,    X86::SUB32mi8,   0 },
+    { X86::SUB32rr,     X86::SUB32mr,    0 },
+    { X86::SUB64ri32,   X86::SUB64mi32,  0 },
+    { X86::SUB64ri8,    X86::SUB64mi8,   0 },
+    { X86::SUB64rr,     X86::SUB64mr,    0 },
+    { X86::SUB8ri,      X86::SUB8mi,     0 },
+    { X86::SUB8rr,      X86::SUB8mr,     0 },
+    { X86::XOR16ri,     X86::XOR16mi,    0 },
+    { X86::XOR16ri8,    X86::XOR16mi8,   0 },
+    { X86::XOR16rr,     X86::XOR16mr,    0 },
+    { X86::XOR32ri,     X86::XOR32mi,    0 },
+    { X86::XOR32ri8,    X86::XOR32mi8,   0 },
+    { X86::XOR32rr,     X86::XOR32mr,    0 },
+    { X86::XOR64ri32,   X86::XOR64mi32,  0 },
+    { X86::XOR64ri8,    X86::XOR64mi8,   0 },
+    { X86::XOR64rr,     X86::XOR64mr,    0 },
+    { X86::XOR8ri,      X86::XOR8mi,     0 },
+    { X86::XOR8rr,      X86::XOR8mr,     0 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
     unsigned RegOp = OpTbl2Addr[i][0];
-    unsigned MemOp = OpTbl2Addr[i][1] & ~TB_FLAGS;
-    assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?");
-    RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U);
-
-    // If this is not a reversible operation (because there is a many->one)
-    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
-    if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE)
-      continue;
-
-    // Index 0, folded load and store, no alignment requirement.
-    unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
-
-    assert(!MemOp2RegOpTable.count(MemOp) &&
-            "Duplicated entries in unfolding maps?");
-    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
-  }
-
-  // If the third value is 1, then it's folding either a load or a store.
-  static const unsigned OpTbl0[][4] = {
-    { X86::BT16ri8,     X86::BT16mi8, 1, 0 },
-    { X86::BT32ri8,     X86::BT32mi8, 1, 0 },
-    { X86::BT64ri8,     X86::BT64mi8, 1, 0 },
-    { X86::CALL32r,     X86::CALL32m, 1, 0 },
-    { X86::CALL64r,     X86::CALL64m, 1, 0 },
-    { X86::WINCALL64r,  X86::WINCALL64m, 1, 0 },
-    { X86::CMP16ri,     X86::CMP16mi, 1, 0 },
-    { X86::CMP16ri8,    X86::CMP16mi8, 1, 0 },
-    { X86::CMP16rr,     X86::CMP16mr, 1, 0 },
-    { X86::CMP32ri,     X86::CMP32mi, 1, 0 },
-    { X86::CMP32ri8,    X86::CMP32mi8, 1, 0 },
-    { X86::CMP32rr,     X86::CMP32mr, 1, 0 },
-    { X86::CMP64ri32,   X86::CMP64mi32, 1, 0 },
-    { X86::CMP64ri8,    X86::CMP64mi8, 1, 0 },
-    { X86::CMP64rr,     X86::CMP64mr, 1, 0 },
-    { X86::CMP8ri,      X86::CMP8mi, 1, 0 },
-    { X86::CMP8rr,      X86::CMP8mr, 1, 0 },
-    { X86::DIV16r,      X86::DIV16m, 1, 0 },
-    { X86::DIV32r,      X86::DIV32m, 1, 0 },
-    { X86::DIV64r,      X86::DIV64m, 1, 0 },
-    { X86::DIV8r,       X86::DIV8m, 1, 0 },
-    { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0, 16 },
-    { X86::FsMOVAPDrr,  X86::MOVSDmr | TB_NOT_REVERSABLE , 0, 0 },
-    { X86::FsMOVAPSrr,  X86::MOVSSmr | TB_NOT_REVERSABLE , 0, 0 },
-    { X86::IDIV16r,     X86::IDIV16m, 1, 0 },
-    { X86::IDIV32r,     X86::IDIV32m, 1, 0 },
-    { X86::IDIV64r,     X86::IDIV64m, 1, 0 },
-    { X86::IDIV8r,      X86::IDIV8m, 1, 0 },
-    { X86::IMUL16r,     X86::IMUL16m, 1, 0 },
-    { X86::IMUL32r,     X86::IMUL32m, 1, 0 },
-    { X86::IMUL64r,     X86::IMUL64m, 1, 0 },
-    { X86::IMUL8r,      X86::IMUL8m, 1, 0 },
-    { X86::JMP32r,      X86::JMP32m, 1, 0 },
-    { X86::JMP64r,      X86::JMP64m, 1, 0 },
-    { X86::MOV16ri,     X86::MOV16mi, 0, 0 },
-    { X86::MOV16rr,     X86::MOV16mr, 0, 0 },
-    { X86::MOV32ri,     X86::MOV32mi, 0, 0 },
-    { X86::MOV32rr,     X86::MOV32mr, 0, 0 },
-    { X86::MOV64ri32,   X86::MOV64mi32, 0, 0 },
-    { X86::MOV64rr,     X86::MOV64mr, 0, 0 },
-    { X86::MOV8ri,      X86::MOV8mi, 0, 0 },
-    { X86::MOV8rr,      X86::MOV8mr, 0, 0 },
-    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0, 0 },
-    { X86::MOVAPDrr,    X86::MOVAPDmr, 0, 16 },
-    { X86::MOVAPSrr,    X86::MOVAPSmr, 0, 16 },
-    { X86::MOVDQArr,    X86::MOVDQAmr, 0, 16 },
-    { X86::VMOVAPDYrr,  X86::VMOVAPDYmr, 0, 32 },
-    { X86::VMOVAPSYrr,  X86::VMOVAPSYmr, 0, 32 },
-    { X86::VMOVDQAYrr,  X86::VMOVDQAYmr, 0, 32 },
-    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
-    { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
-    { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
-    { X86::MOVSS2DIrr,  X86::MOVSS2DImr, 0, 0 },
-    { X86::MOVUPDrr,    X86::MOVUPDmr, 0, 0 },
-    { X86::MOVUPSrr,    X86::MOVUPSmr, 0, 0 },
-    { X86::VMOVUPDYrr,  X86::VMOVUPDYmr, 0, 0 },
-    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr, 0, 0 },
-    { X86::MUL16r,      X86::MUL16m, 1, 0 },
-    { X86::MUL32r,      X86::MUL32m, 1, 0 },
-    { X86::MUL64r,      X86::MUL64m, 1, 0 },
-    { X86::MUL8r,       X86::MUL8m, 1, 0 },
-    { X86::SETAEr,      X86::SETAEm, 0, 0 },
-    { X86::SETAr,       X86::SETAm, 0, 0 },
-    { X86::SETBEr,      X86::SETBEm, 0, 0 },
-    { X86::SETBr,       X86::SETBm, 0, 0 },
-    { X86::SETEr,       X86::SETEm, 0, 0 },
-    { X86::SETGEr,      X86::SETGEm, 0, 0 },
-    { X86::SETGr,       X86::SETGm, 0, 0 },
-    { X86::SETLEr,      X86::SETLEm, 0, 0 },
-    { X86::SETLr,       X86::SETLm, 0, 0 },
-    { X86::SETNEr,      X86::SETNEm, 0, 0 },
-    { X86::SETNOr,      X86::SETNOm, 0, 0 },
-    { X86::SETNPr,      X86::SETNPm, 0, 0 },
-    { X86::SETNSr,      X86::SETNSm, 0, 0 },
-    { X86::SETOr,       X86::SETOm, 0, 0 },
-    { X86::SETPr,       X86::SETPm, 0, 0 },
-    { X86::SETSr,       X86::SETSm, 0, 0 },
-    { X86::TAILJMPr,    X86::TAILJMPm, 1, 0 },
-    { X86::TAILJMPr64,  X86::TAILJMPm64, 1, 0 },
-    { X86::TEST16ri,    X86::TEST16mi, 1, 0 },
-    { X86::TEST32ri,    X86::TEST32mi, 1, 0 },
-    { X86::TEST64ri32,  X86::TEST64mi32, 1, 0 },
-    { X86::TEST8ri,     X86::TEST8mi, 1, 0 }
+    unsigned MemOp = OpTbl2Addr[i][1];
+    unsigned Flags = OpTbl2Addr[i][2];
+    AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 0, folded load and store, no alignment requirement.
+                  Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+  }
+
+  static const unsigned OpTbl0[][3] = {
+    { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
+    { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
+    { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
+    { X86::CALL32r,     X86::CALL32m,       TB_FOLDED_LOAD },
+    { X86::CALL64r,     X86::CALL64m,       TB_FOLDED_LOAD },
+    { X86::WINCALL64r,  X86::WINCALL64m,    TB_FOLDED_LOAD },
+    { X86::CMP16ri,     X86::CMP16mi,       TB_FOLDED_LOAD },
+    { X86::CMP16ri8,    X86::CMP16mi8,      TB_FOLDED_LOAD },
+    { X86::CMP16rr,     X86::CMP16mr,       TB_FOLDED_LOAD },
+    { X86::CMP32ri,     X86::CMP32mi,       TB_FOLDED_LOAD },
+    { X86::CMP32ri8,    X86::CMP32mi8,      TB_FOLDED_LOAD },
+    { X86::CMP32rr,     X86::CMP32mr,       TB_FOLDED_LOAD },
+    { X86::CMP64ri32,   X86::CMP64mi32,     TB_FOLDED_LOAD },
+    { X86::CMP64ri8,    X86::CMP64mi8,      TB_FOLDED_LOAD },
+    { X86::CMP64rr,     X86::CMP64mr,       TB_FOLDED_LOAD },
+    { X86::CMP8ri,      X86::CMP8mi,        TB_FOLDED_LOAD },
+    { X86::CMP8rr,      X86::CMP8mr,        TB_FOLDED_LOAD },
+    { X86::DIV16r,      X86::DIV16m,        TB_FOLDED_LOAD },
+    { X86::DIV32r,      X86::DIV32m,        TB_FOLDED_LOAD },
+    { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
+    { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
+    { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::FsMOVAPDrr,  X86::MOVSDmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::FsMOVAPSrr,  X86::MOVSSmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
+    { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
+    { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
+    { X86::IDIV8r,      X86::IDIV8m,        TB_FOLDED_LOAD },
+    { X86::IMUL16r,     X86::IMUL16m,       TB_FOLDED_LOAD },
+    { X86::IMUL32r,     X86::IMUL32m,       TB_FOLDED_LOAD },
+    { X86::IMUL64r,     X86::IMUL64m,       TB_FOLDED_LOAD },
+    { X86::IMUL8r,      X86::IMUL8m,        TB_FOLDED_LOAD },
+    { X86::JMP32r,      X86::JMP32m,        TB_FOLDED_LOAD },
+    { X86::JMP64r,      X86::JMP64m,        TB_FOLDED_LOAD },
+    { X86::MOV16ri,     X86::MOV16mi,       TB_FOLDED_STORE },
+    { X86::MOV16rr,     X86::MOV16mr,       TB_FOLDED_STORE },
+    { X86::MOV32ri,     X86::MOV32mi,       TB_FOLDED_STORE },
+    { X86::MOV32rr,     X86::MOV32mr,       TB_FOLDED_STORE },
+    { X86::MOV64ri32,   X86::MOV64mi32,     TB_FOLDED_STORE },
+    { X86::MOV64rr,     X86::MOV64mr,       TB_FOLDED_STORE },
+    { X86::MOV8ri,      X86::MOV8mi,        TB_FOLDED_STORE },
+    { X86::MOV8rr,      X86::MOV8mr,        TB_FOLDED_STORE },
+    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+    { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
+    { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
+    { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
+    { X86::MOVSS2DIrr,  X86::MOVSS2DImr,    TB_FOLDED_STORE },
+    { X86::MOVUPDrr,    X86::MOVUPDmr,      TB_FOLDED_STORE },
+    { X86::MOVUPSrr,    X86::MOVUPSmr,      TB_FOLDED_STORE },
+    { X86::MUL16r,      X86::MUL16m,        TB_FOLDED_LOAD },
+    { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
+    { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
+    { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
+    { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
+    { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
+    { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
+    { X86::SETBr,       X86::SETBm,         TB_FOLDED_STORE },
+    { X86::SETEr,       X86::SETEm,         TB_FOLDED_STORE },
+    { X86::SETGEr,      X86::SETGEm,        TB_FOLDED_STORE },
+    { X86::SETGr,       X86::SETGm,         TB_FOLDED_STORE },
+    { X86::SETLEr,      X86::SETLEm,        TB_FOLDED_STORE },
+    { X86::SETLr,       X86::SETLm,         TB_FOLDED_STORE },
+    { X86::SETNEr,      X86::SETNEm,        TB_FOLDED_STORE },
+    { X86::SETNOr,      X86::SETNOm,        TB_FOLDED_STORE },
+    { X86::SETNPr,      X86::SETNPm,        TB_FOLDED_STORE },
+    { X86::SETNSr,      X86::SETNSm,        TB_FOLDED_STORE },
+    { X86::SETOr,       X86::SETOm,         TB_FOLDED_STORE },
+    { X86::SETPr,       X86::SETPm,         TB_FOLDED_STORE },
+    { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
+    { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
+    { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
+    { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
+    { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
+    { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
+    { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::FsVMOVAPDrr, X86::VMOVSDmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::FsVMOVAPSrr, X86::VMOVSSmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
+    { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+    { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
+    { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
+    { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
+    { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
+    // AVX 256-bit foldable instructions
+    { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
     unsigned RegOp      = OpTbl0[i][0];
-    unsigned MemOp      = OpTbl0[i][1] & ~TB_FLAGS;
-    unsigned FoldedLoad = OpTbl0[i][2];
-    unsigned Align      = OpTbl0[i][3];
-    assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?");
-    RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align);
-
-    // If this is not a reversible operation (because there is a many->one)
-    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
-    if (OpTbl0[i][1] & TB_NOT_REVERSABLE)
-      continue;
-
-    // Index 0, folded load or store.
-    unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
-    assert(!MemOp2RegOpTable.count(MemOp) && "Duplicated entries?");
-    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
+    unsigned MemOp      = OpTbl0[i][1];
+    unsigned Flags      = OpTbl0[i][2];
+    AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+                  RegOp, MemOp, TB_INDEX_0 | Flags);
   }
 
   static const unsigned OpTbl1[][3] = {
-    { X86::CMP16rr,         X86::CMP16rm, 0 },
-    { X86::CMP32rr,         X86::CMP32rm, 0 },
-    { X86::CMP64rr,         X86::CMP64rm, 0 },
-    { X86::CMP8rr,          X86::CMP8rm, 0 },
-    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm, 0 },
-    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm, 0 },
-    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm, 0 },
-    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm, 0 },
-    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm, 0 },
-    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm, 0 },
-    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm, 0 },
-    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm, 0 },
-    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm, 0 },
-    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm, 0 },
-    { X86::FsMOVAPDrr,      X86::MOVSDrm | TB_NOT_REVERSABLE , 0 },
-    { X86::FsMOVAPSrr,      X86::MOVSSrm | TB_NOT_REVERSABLE , 0 },
-    { X86::IMUL16rri,       X86::IMUL16rmi, 0 },
-    { X86::IMUL16rri8,      X86::IMUL16rmi8, 0 },
-    { X86::IMUL32rri,       X86::IMUL32rmi, 0 },
-    { X86::IMUL32rri8,      X86::IMUL32rmi8, 0 },
-    { X86::IMUL64rri32,     X86::IMUL64rmi32, 0 },
-    { X86::IMUL64rri8,      X86::IMUL64rmi8, 0 },
-    { X86::Int_COMISDrr,    X86::Int_COMISDrm, 0 },
-    { X86::Int_COMISSrr,    X86::Int_COMISSrm, 0 },
-    { X86::Int_CVTDQ2PDrr,  X86::Int_CVTDQ2PDrm, 16 },
-    { X86::Int_CVTDQ2PSrr,  X86::Int_CVTDQ2PSrm, 16 },
-    { X86::Int_CVTPD2DQrr,  X86::Int_CVTPD2DQrm, 16 },
-    { X86::Int_CVTPD2PSrr,  X86::Int_CVTPD2PSrm, 16 },
-    { X86::Int_CVTPS2DQrr,  X86::Int_CVTPS2DQrm, 16 },
-    { X86::Int_CVTPS2PDrr,  X86::Int_CVTPS2PDrm, 0 },
-    { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm, 0 },
-    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm, 0 },
-    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm, 0 },
-    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
-    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm, 0 },
-    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
-    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm, 0 },
-    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm, 0 },
-    { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm, 0 },
-    { X86::Int_CVTSS2SIrr,  X86::Int_CVTSS2SIrm, 0 },
-    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm, 16 },
-    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm, 16 },
-    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
-    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
-    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
-    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
-    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm, 0 },
-    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm, 0 },
-    { X86::MOV16rr,         X86::MOV16rm, 0 },
-    { X86::MOV32rr,         X86::MOV32rm, 0 },
-    { X86::MOV64rr,         X86::MOV64rm, 0 },
-    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm, 0 },
-    { X86::MOV64toSDrr,     X86::MOV64toSDrm, 0 },
-    { X86::MOV8rr,          X86::MOV8rm, 0 },
-    { X86::MOVAPDrr,        X86::MOVAPDrm, 16 },
-    { X86::MOVAPSrr,        X86::MOVAPSrm, 16 },
-    { X86::VMOVAPDYrr,      X86::VMOVAPDYrm, 32 },
-    { X86::VMOVAPSYrr,      X86::VMOVAPSYrm, 32 },
-    { X86::MOVDDUPrr,       X86::MOVDDUPrm, 0 },
-    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm, 0 },
-    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm, 0 },
-    { X86::MOVDQArr,        X86::MOVDQArm, 16 },
-    { X86::VMOVDQAYrr,      X86::VMOVDQAYrm, 16 },
-    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm, 16 },
-    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm, 16 },
-    { X86::MOVSX16rr8,      X86::MOVSX16rm8, 0 },
-    { X86::MOVSX32rr16,     X86::MOVSX32rm16, 0 },
-    { X86::MOVSX32rr8,      X86::MOVSX32rm8, 0 },
-    { X86::MOVSX64rr16,     X86::MOVSX64rm16, 0 },
-    { X86::MOVSX64rr32,     X86::MOVSX64rm32, 0 },
-    { X86::MOVSX64rr8,      X86::MOVSX64rm8, 0 },
-    { X86::MOVUPDrr,        X86::MOVUPDrm, 16 },
-    { X86::MOVUPSrr,        X86::MOVUPSrm, 0 },
-    { X86::VMOVUPDYrr,      X86::VMOVUPDYrm, 0 },
-    { X86::VMOVUPSYrr,      X86::VMOVUPSYrm, 0 },
-    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm, 0 },
-    { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm, 0 },
-    { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
-    { X86::MOVZX16rr8,      X86::MOVZX16rm8, 0 },
-    { X86::MOVZX32rr16,     X86::MOVZX32rm16, 0 },
-    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
-    { X86::MOVZX32rr8,      X86::MOVZX32rm8, 0 },
-    { X86::MOVZX64rr16,     X86::MOVZX64rm16, 0 },
-    { X86::MOVZX64rr32,     X86::MOVZX64rm32, 0 },
-    { X86::MOVZX64rr8,      X86::MOVZX64rm8, 0 },
-    { X86::PSHUFDri,        X86::PSHUFDmi, 16 },
-    { X86::PSHUFHWri,       X86::PSHUFHWmi, 16 },
-    { X86::PSHUFLWri,       X86::PSHUFLWmi, 16 },
-    { X86::RCPPSr,          X86::RCPPSm, 16 },
-    { X86::RCPPSr_Int,      X86::RCPPSm_Int, 16 },
-    { X86::RSQRTPSr,        X86::RSQRTPSm, 16 },
-    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int, 16 },
-    { X86::RSQRTSSr,        X86::RSQRTSSm, 0 },
-    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int, 0 },
-    { X86::SQRTPDr,         X86::SQRTPDm, 16 },
-    { X86::SQRTPDr_Int,     X86::SQRTPDm_Int, 16 },
-    { X86::SQRTPSr,         X86::SQRTPSm, 16 },
-    { X86::SQRTPSr_Int,     X86::SQRTPSm_Int, 16 },
-    { X86::SQRTSDr,         X86::SQRTSDm, 0 },
-    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int, 0 },
-    { X86::SQRTSSr,         X86::SQRTSSm, 0 },
-    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int, 0 },
-    { X86::TEST16rr,        X86::TEST16rm, 0 },
-    { X86::TEST32rr,        X86::TEST32rm, 0 },
-    { X86::TEST64rr,        X86::TEST64rm, 0 },
-    { X86::TEST8rr,         X86::TEST8rm, 0 },
+    { X86::CMP16rr,         X86::CMP16rm,             0 },
+    { X86::CMP32rr,         X86::CMP32rm,             0 },
+    { X86::CMP64rr,         X86::CMP64rm,             0 },
+    { X86::CMP8rr,          X86::CMP8rm,              0 },
+    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm,          0 },
+    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm,        0 },
+    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm,          0 },
+    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm,        0 },
+    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm,          0 },
+    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm,          0 },
+    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm,       0 },
+    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
+    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
+    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
+    { X86::FsMOVAPDrr,      X86::MOVSDrm,             TB_NO_REVERSE },
+    { X86::FsMOVAPSrr,      X86::MOVSSrm,             TB_NO_REVERSE },
+    { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
+    { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
+    { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
+    { X86::IMUL32rri8,      X86::IMUL32rmi8,          0 },
+    { X86::IMUL64rri32,     X86::IMUL64rmi32,         0 },
+    { X86::IMUL64rri8,      X86::IMUL64rmi8,          0 },
+    { X86::Int_COMISDrr,    X86::Int_COMISDrm,        0 },
+    { X86::Int_COMISSrr,    X86::Int_COMISSrm,        0 },
+    { X86::Int_CVTDQ2PDrr,  X86::Int_CVTDQ2PDrm,      TB_ALIGN_16 },
+    { X86::Int_CVTDQ2PSrr,  X86::Int_CVTDQ2PSrm,      TB_ALIGN_16 },
+    { X86::Int_CVTPD2DQrr,  X86::Int_CVTPD2DQrm,      TB_ALIGN_16 },
+    { X86::Int_CVTPD2PSrr,  X86::Int_CVTPD2PSrm,      TB_ALIGN_16 },
+    { X86::Int_CVTPS2DQrr,  X86::Int_CVTPS2DQrm,      TB_ALIGN_16 },
+    { X86::Int_CVTPS2PDrr,  X86::Int_CVTPS2PDrm,      0 },
+    { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm,        0 },
+    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
+    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
+    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
+    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
+    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm,     0 },
+    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm,  0 },
+    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm,     0 },
+    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm,       0 },
+    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm,       0 },
+    { X86::MOV16rr,         X86::MOV16rm,             0 },
+    { X86::MOV32rr,         X86::MOV32rm,             0 },
+    { X86::MOV64rr,         X86::MOV64rm,             0 },
+    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm,         0 },
+    { X86::MOV64toSDrr,     X86::MOV64toSDrm,         0 },
+    { X86::MOV8rr,          X86::MOV8rm,              0 },
+    { X86::MOVAPDrr,        X86::MOVAPDrm,            TB_ALIGN_16 },
+    { X86::MOVAPSrr,        X86::MOVAPSrm,            TB_ALIGN_16 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm,           0 },
+    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,         0 },
+    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,          0 },
+    { X86::MOVDQArr,        X86::MOVDQArm,            TB_ALIGN_16 },
+    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSX16rr8,      X86::MOVSX16rm8,          0 },
+    { X86::MOVSX32rr16,     X86::MOVSX32rm16,         0 },
+    { X86::MOVSX32rr8,      X86::MOVSX32rm8,          0 },
+    { X86::MOVSX64rr16,     X86::MOVSX64rm16,         0 },
+    { X86::MOVSX64rr32,     X86::MOVSX64rm32,         0 },
+    { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
+    { X86::MOVUPDrr,        X86::MOVUPDrm,            TB_ALIGN_16 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
+    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm,        0 },
+    { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
+    { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
+    { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
+    { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
+    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
+    { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
+    { X86::MOVZX64rr16,     X86::MOVZX64rm16,         0 },
+    { X86::MOVZX64rr32,     X86::MOVZX64rm32,         0 },
+    { X86::MOVZX64rr8,      X86::MOVZX64rm8,          0 },
+    { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
+    { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
+    { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
+    { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
+    { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
+    { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
+    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
+    { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
+    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        0 },
+    { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
+    { X86::SQRTPDr_Int,     X86::SQRTPDm_Int,         TB_ALIGN_16 },
+    { X86::SQRTPSr,         X86::SQRTPSm,             TB_ALIGN_16 },
+    { X86::SQRTPSr_Int,     X86::SQRTPSm_Int,         TB_ALIGN_16 },
+    { X86::SQRTSDr,         X86::SQRTSDm,             0 },
+    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int,         0 },
+    { X86::SQRTSSr,         X86::SQRTSSm,             0 },
+    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int,         0 },
+    { X86::TEST16rr,        X86::TEST16rm,            0 },
+    { X86::TEST32rr,        X86::TEST32rm,            0 },
+    { X86::TEST64rr,        X86::TEST64rm,            0 },
+    { X86::TEST8rr,         X86::TEST8rm,             0 },
     // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
-    { X86::UCOMISDrr,       X86::UCOMISDrm, 0 },
-    { X86::UCOMISSrr,       X86::UCOMISSrm, 0 }
+    { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
+    { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
+    { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
+    { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm,     TB_ALIGN_16 },
+    { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm,     TB_ALIGN_16 },
+    { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm,     TB_ALIGN_16 },
+    { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm,     TB_ALIGN_16 },
+    { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm,     TB_ALIGN_16 },
+    { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm,     0 },
+    { X86::Int_VUCOMISDrr,  X86::Int_VUCOMISDrm,      0 },
+    { X86::Int_VUCOMISSrr,  X86::Int_VUCOMISSrm,      0 },
+    { X86::FsVMOVAPDrr,     X86::VMOVSDrm,            TB_NO_REVERSE },
+    { X86::FsVMOVAPSrr,     X86::VMOVSSrm,            TB_NO_REVERSE },
+    { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
+    { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
+    { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
+    { X86::VMOVAPSrr,       X86::VMOVAPSrm,           TB_ALIGN_16 },
+    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,          0 },
+    { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
+    { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
+    { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
+    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVUPDrr,       X86::VMOVUPDrm,           TB_ALIGN_16 },
+    { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
+    { X86::VMOVZDI2PDIrr,   X86::VMOVZDI2PDIrm,       0 },
+    { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
+    { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
+    { X86::VPSHUFDri,       X86::VPSHUFDmi,           TB_ALIGN_16 },
+    { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          TB_ALIGN_16 },
+    { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          TB_ALIGN_16 },
+    { X86::VRCPPSr,         X86::VRCPPSm,             TB_ALIGN_16 },
+    { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         TB_ALIGN_16 },
+    { X86::VRSQRTPSr,       X86::VRSQRTPSm,           TB_ALIGN_16 },
+    { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       TB_ALIGN_16 },
+    { X86::VSQRTPDr,        X86::VSQRTPDm,            TB_ALIGN_16 },
+    { X86::VSQRTPDr_Int,    X86::VSQRTPDm_Int,        TB_ALIGN_16 },
+    { X86::VSQRTPSr,        X86::VSQRTPSm,            TB_ALIGN_16 },
+    { X86::VSQRTPSr_Int,    X86::VSQRTPSm_Int,        TB_ALIGN_16 },
+    { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
+    { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
+    // AVX 256-bit foldable instructions
+    { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_16 },
+    { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
+    { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
     unsigned RegOp = OpTbl1[i][0];
-    unsigned MemOp = OpTbl1[i][1] & ~TB_FLAGS;
-    unsigned Align = OpTbl1[i][2];
-    assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries");
-    RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align);
-
-    // If this is not a reversible operation (because there is a many->one)
-    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
-    if (OpTbl1[i][1] & TB_NOT_REVERSABLE)
-      continue;
-
-    // Index 1, folded load
-    unsigned AuxInfo = 1 | (1 << 4);
-    assert(!MemOp2RegOpTable.count(MemOp) && "Duplicate entries");
-    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
+    unsigned MemOp = OpTbl1[i][1];
+    unsigned Flags = OpTbl1[i][2];
+    AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 1, folded load
+                  Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
   }
 
   static const unsigned OpTbl2[][3] = {
-    { X86::ADC32rr,         X86::ADC32rm, 0 },
-    { X86::ADC64rr,         X86::ADC64rm, 0 },
-    { X86::ADD16rr,         X86::ADD16rm, 0 },
-    { X86::ADD16rr_DB,      X86::ADD16rm | TB_NOT_REVERSABLE, 0 },
-    { X86::ADD32rr,         X86::ADD32rm, 0 },
-    { X86::ADD32rr_DB,      X86::ADD32rm | TB_NOT_REVERSABLE, 0 },
-    { X86::ADD64rr,         X86::ADD64rm, 0 },
-    { X86::ADD64rr_DB,      X86::ADD64rm | TB_NOT_REVERSABLE, 0 },
-    { X86::ADD8rr,          X86::ADD8rm, 0 },
-    { X86::ADDPDrr,         X86::ADDPDrm, 16 },
-    { X86::ADDPSrr,         X86::ADDPSrm, 16 },
-    { X86::ADDSDrr,         X86::ADDSDrm, 0 },
-    { X86::ADDSSrr,         X86::ADDSSrm, 0 },
-    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm, 16 },
-    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm, 16 },
-    { X86::AND16rr,         X86::AND16rm, 0 },
-    { X86::AND32rr,         X86::AND32rm, 0 },
-    { X86::AND64rr,         X86::AND64rm, 0 },
-    { X86::AND8rr,          X86::AND8rm, 0 },
-    { X86::ANDNPDrr,        X86::ANDNPDrm, 16 },
-    { X86::ANDNPSrr,        X86::ANDNPSrm, 16 },
-    { X86::ANDPDrr,         X86::ANDPDrm, 16 },
-    { X86::ANDPSrr,         X86::ANDPSrm, 16 },
-    { X86::CMOVA16rr,       X86::CMOVA16rm, 0 },
-    { X86::CMOVA32rr,       X86::CMOVA32rm, 0 },
-    { X86::CMOVA64rr,       X86::CMOVA64rm, 0 },
-    { X86::CMOVAE16rr,      X86::CMOVAE16rm, 0 },
-    { X86::CMOVAE32rr,      X86::CMOVAE32rm, 0 },
-    { X86::CMOVAE64rr,      X86::CMOVAE64rm, 0 },
-    { X86::CMOVB16rr,       X86::CMOVB16rm, 0 },
-    { X86::CMOVB32rr,       X86::CMOVB32rm, 0 },
-    { X86::CMOVB64rr,       X86::CMOVB64rm, 0 },
-    { X86::CMOVBE16rr,      X86::CMOVBE16rm, 0 },
-    { X86::CMOVBE32rr,      X86::CMOVBE32rm, 0 },
-    { X86::CMOVBE64rr,      X86::CMOVBE64rm, 0 },
-    { X86::CMOVE16rr,       X86::CMOVE16rm, 0 },
-    { X86::CMOVE32rr,       X86::CMOVE32rm, 0 },
-    { X86::CMOVE64rr,       X86::CMOVE64rm, 0 },
-    { X86::CMOVG16rr,       X86::CMOVG16rm, 0 },
-    { X86::CMOVG32rr,       X86::CMOVG32rm, 0 },
-    { X86::CMOVG64rr,       X86::CMOVG64rm, 0 },
-    { X86::CMOVGE16rr,      X86::CMOVGE16rm, 0 },
-    { X86::CMOVGE32rr,      X86::CMOVGE32rm, 0 },
-    { X86::CMOVGE64rr,      X86::CMOVGE64rm, 0 },
-    { X86::CMOVL16rr,       X86::CMOVL16rm, 0 },
-    { X86::CMOVL32rr,       X86::CMOVL32rm, 0 },
-    { X86::CMOVL64rr,       X86::CMOVL64rm, 0 },
-    { X86::CMOVLE16rr,      X86::CMOVLE16rm, 0 },
-    { X86::CMOVLE32rr,      X86::CMOVLE32rm, 0 },
-    { X86::CMOVLE64rr,      X86::CMOVLE64rm, 0 },
-    { X86::CMOVNE16rr,      X86::CMOVNE16rm, 0 },
-    { X86::CMOVNE32rr,      X86::CMOVNE32rm, 0 },
-    { X86::CMOVNE64rr,      X86::CMOVNE64rm, 0 },
-    { X86::CMOVNO16rr,      X86::CMOVNO16rm, 0 },
-    { X86::CMOVNO32rr,      X86::CMOVNO32rm, 0 },
-    { X86::CMOVNO64rr,      X86::CMOVNO64rm, 0 },
-    { X86::CMOVNP16rr,      X86::CMOVNP16rm, 0 },
-    { X86::CMOVNP32rr,      X86::CMOVNP32rm, 0 },
-    { X86::CMOVNP64rr,      X86::CMOVNP64rm, 0 },
-    { X86::CMOVNS16rr,      X86::CMOVNS16rm, 0 },
-    { X86::CMOVNS32rr,      X86::CMOVNS32rm, 0 },
-    { X86::CMOVNS64rr,      X86::CMOVNS64rm, 0 },
-    { X86::CMOVO16rr,       X86::CMOVO16rm, 0 },
-    { X86::CMOVO32rr,       X86::CMOVO32rm, 0 },
-    { X86::CMOVO64rr,       X86::CMOVO64rm, 0 },
-    { X86::CMOVP16rr,       X86::CMOVP16rm, 0 },
-    { X86::CMOVP32rr,       X86::CMOVP32rm, 0 },
-    { X86::CMOVP64rr,       X86::CMOVP64rm, 0 },
-    { X86::CMOVS16rr,       X86::CMOVS16rm, 0 },
-    { X86::CMOVS32rr,       X86::CMOVS32rm, 0 },
-    { X86::CMOVS64rr,       X86::CMOVS64rm, 0 },
-    { X86::CMPPDrri,        X86::CMPPDrmi, 16 },
-    { X86::CMPPSrri,        X86::CMPPSrmi, 16 },
-    { X86::CMPSDrr,         X86::CMPSDrm, 0 },
-    { X86::CMPSSrr,         X86::CMPSSrm, 0 },
-    { X86::DIVPDrr,         X86::DIVPDrm, 16 },
-    { X86::DIVPSrr,         X86::DIVPSrm, 16 },
-    { X86::DIVSDrr,         X86::DIVSDrm, 0 },
-    { X86::DIVSSrr,         X86::DIVSSrm, 0 },
-    { X86::FsANDNPDrr,      X86::FsANDNPDrm, 16 },
-    { X86::FsANDNPSrr,      X86::FsANDNPSrm, 16 },
-    { X86::FsANDPDrr,       X86::FsANDPDrm, 16 },
-    { X86::FsANDPSrr,       X86::FsANDPSrm, 16 },
-    { X86::FsORPDrr,        X86::FsORPDrm, 16 },
-    { X86::FsORPSrr,        X86::FsORPSrm, 16 },
-    { X86::FsXORPDrr,       X86::FsXORPDrm, 16 },
-    { X86::FsXORPSrr,       X86::FsXORPSrm, 16 },
-    { X86::HADDPDrr,        X86::HADDPDrm, 16 },
-    { X86::HADDPSrr,        X86::HADDPSrm, 16 },
-    { X86::HSUBPDrr,        X86::HSUBPDrm, 16 },
-    { X86::HSUBPSrr,        X86::HSUBPSrm, 16 },
-    { X86::IMUL16rr,        X86::IMUL16rm, 0 },
-    { X86::IMUL32rr,        X86::IMUL32rm, 0 },
-    { X86::IMUL64rr,        X86::IMUL64rm, 0 },
-    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm, 0 },
-    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm, 0 },
-    { X86::MAXPDrr,         X86::MAXPDrm, 16 },
-    { X86::MAXPDrr_Int,     X86::MAXPDrm_Int, 16 },
-    { X86::MAXPSrr,         X86::MAXPSrm, 16 },
-    { X86::MAXPSrr_Int,     X86::MAXPSrm_Int, 16 },
-    { X86::MAXSDrr,         X86::MAXSDrm, 0 },
-    { X86::MAXSDrr_Int,     X86::MAXSDrm_Int, 0 },
-    { X86::MAXSSrr,         X86::MAXSSrm, 0 },
-    { X86::MAXSSrr_Int,     X86::MAXSSrm_Int, 0 },
-    { X86::MINPDrr,         X86::MINPDrm, 16 },
-    { X86::MINPDrr_Int,     X86::MINPDrm_Int, 16 },
-    { X86::MINPSrr,         X86::MINPSrm, 16 },
-    { X86::MINPSrr_Int,     X86::MINPSrm_Int, 16 },
-    { X86::MINSDrr,         X86::MINSDrm, 0 },
-    { X86::MINSDrr_Int,     X86::MINSDrm_Int, 0 },
-    { X86::MINSSrr,         X86::MINSSrm, 0 },
-    { X86::MINSSrr_Int,     X86::MINSSrm_Int, 0 },
-    { X86::MULPDrr,         X86::MULPDrm, 16 },
-    { X86::MULPSrr,         X86::MULPSrm, 16 },
-    { X86::MULSDrr,         X86::MULSDrm, 0 },
-    { X86::MULSSrr,         X86::MULSSrm, 0 },
-    { X86::OR16rr,          X86::OR16rm, 0 },
-    { X86::OR32rr,          X86::OR32rm, 0 },
-    { X86::OR64rr,          X86::OR64rm, 0 },
-    { X86::OR8rr,           X86::OR8rm, 0 },
-    { X86::ORPDrr,          X86::ORPDrm, 16 },
-    { X86::ORPSrr,          X86::ORPSrm, 16 },
-    { X86::PACKSSDWrr,      X86::PACKSSDWrm, 16 },
-    { X86::PACKSSWBrr,      X86::PACKSSWBrm, 16 },
-    { X86::PACKUSWBrr,      X86::PACKUSWBrm, 16 },
-    { X86::PADDBrr,         X86::PADDBrm, 16 },
-    { X86::PADDDrr,         X86::PADDDrm, 16 },
-    { X86::PADDQrr,         X86::PADDQrm, 16 },
-    { X86::PADDSBrr,        X86::PADDSBrm, 16 },
-    { X86::PADDSWrr,        X86::PADDSWrm, 16 },
-    { X86::PADDWrr,         X86::PADDWrm, 16 },
-    { X86::PANDNrr,         X86::PANDNrm, 16 },
-    { X86::PANDrr,          X86::PANDrm, 16 },
-    { X86::PAVGBrr,         X86::PAVGBrm, 16 },
-    { X86::PAVGWrr,         X86::PAVGWrm, 16 },
-    { X86::PCMPEQBrr,       X86::PCMPEQBrm, 16 },
-    { X86::PCMPEQDrr,       X86::PCMPEQDrm, 16 },
-    { X86::PCMPEQWrr,       X86::PCMPEQWrm, 16 },
-    { X86::PCMPGTBrr,       X86::PCMPGTBrm, 16 },
-    { X86::PCMPGTDrr,       X86::PCMPGTDrm, 16 },
-    { X86::PCMPGTWrr,       X86::PCMPGTWrm, 16 },
-    { X86::PINSRWrri,       X86::PINSRWrmi, 16 },
-    { X86::PMADDWDrr,       X86::PMADDWDrm, 16 },
-    { X86::PMAXSWrr,        X86::PMAXSWrm, 16 },
-    { X86::PMAXUBrr,        X86::PMAXUBrm, 16 },
-    { X86::PMINSWrr,        X86::PMINSWrm, 16 },
-    { X86::PMINUBrr,        X86::PMINUBrm, 16 },
-    { X86::PMULDQrr,        X86::PMULDQrm, 16 },
-    { X86::PMULHUWrr,       X86::PMULHUWrm, 16 },
-    { X86::PMULHWrr,        X86::PMULHWrm, 16 },
-    { X86::PMULLDrr,        X86::PMULLDrm, 16 },
-    { X86::PMULLWrr,        X86::PMULLWrm, 16 },
-    { X86::PMULUDQrr,       X86::PMULUDQrm, 16 },
-    { X86::PORrr,           X86::PORrm, 16 },
-    { X86::PSADBWrr,        X86::PSADBWrm, 16 },
-    { X86::PSLLDrr,         X86::PSLLDrm, 16 },
-    { X86::PSLLQrr,         X86::PSLLQrm, 16 },
-    { X86::PSLLWrr,         X86::PSLLWrm, 16 },
-    { X86::PSRADrr,         X86::PSRADrm, 16 },
-    { X86::PSRAWrr,         X86::PSRAWrm, 16 },
-    { X86::PSRLDrr,         X86::PSRLDrm, 16 },
-    { X86::PSRLQrr,         X86::PSRLQrm, 16 },
-    { X86::PSRLWrr,         X86::PSRLWrm, 16 },
-    { X86::PSUBBrr,         X86::PSUBBrm, 16 },
-    { X86::PSUBDrr,         X86::PSUBDrm, 16 },
-    { X86::PSUBSBrr,        X86::PSUBSBrm, 16 },
-    { X86::PSUBSWrr,        X86::PSUBSWrm, 16 },
-    { X86::PSUBWrr,         X86::PSUBWrm, 16 },
-    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm, 16 },
-    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm, 16 },
-    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm, 16 },
-    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm, 16 },
-    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm, 16 },
-    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm, 16 },
-    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm, 16 },
-    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm, 16 },
-    { X86::PXORrr,          X86::PXORrm, 16 },
-    { X86::SBB32rr,         X86::SBB32rm, 0 },
-    { X86::SBB64rr,         X86::SBB64rm, 0 },
-    { X86::SHUFPDrri,       X86::SHUFPDrmi, 16 },
-    { X86::SHUFPSrri,       X86::SHUFPSrmi, 16 },
-    { X86::SUB16rr,         X86::SUB16rm, 0 },
-    { X86::SUB32rr,         X86::SUB32rm, 0 },
-    { X86::SUB64rr,         X86::SUB64rm, 0 },
-    { X86::SUB8rr,          X86::SUB8rm, 0 },
-    { X86::SUBPDrr,         X86::SUBPDrm, 16 },
-    { X86::SUBPSrr,         X86::SUBPSrm, 16 },
-    { X86::SUBSDrr,         X86::SUBSDrm, 0 },
-    { X86::SUBSSrr,         X86::SUBSSrm, 0 },
+    { X86::ADC32rr,         X86::ADC32rm,       0 },
+    { X86::ADC64rr,         X86::ADC64rm,       0 },
+    { X86::ADD16rr,         X86::ADD16rm,       0 },
+    { X86::ADD16rr_DB,      X86::ADD16rm,       TB_NO_REVERSE },
+    { X86::ADD32rr,         X86::ADD32rm,       0 },
+    { X86::ADD32rr_DB,      X86::ADD32rm,       TB_NO_REVERSE },
+    { X86::ADD64rr,         X86::ADD64rm,       0 },
+    { X86::ADD64rr_DB,      X86::ADD64rm,       TB_NO_REVERSE },
+    { X86::ADD8rr,          X86::ADD8rm,        0 },
+    { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
+    { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
+    { X86::ADDSDrr,         X86::ADDSDrm,       0 },
+    { X86::ADDSSrr,         X86::ADDSSrm,       0 },
+    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
+    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
+    { X86::AND16rr,         X86::AND16rm,       0 },
+    { X86::AND32rr,         X86::AND32rm,       0 },
+    { X86::AND64rr,         X86::AND64rm,       0 },
+    { X86::AND8rr,          X86::AND8rm,        0 },
+    { X86::ANDNPDrr,        X86::ANDNPDrm,      TB_ALIGN_16 },
+    { X86::ANDNPSrr,        X86::ANDNPSrm,      TB_ALIGN_16 },
+    { X86::ANDPDrr,         X86::ANDPDrm,       TB_ALIGN_16 },
+    { X86::ANDPSrr,         X86::ANDPSrm,       TB_ALIGN_16 },
+    { X86::CMOVA16rr,       X86::CMOVA16rm,     0 },
+    { X86::CMOVA32rr,       X86::CMOVA32rm,     0 },
+    { X86::CMOVA64rr,       X86::CMOVA64rm,     0 },
+    { X86::CMOVAE16rr,      X86::CMOVAE16rm,    0 },
+    { X86::CMOVAE32rr,      X86::CMOVAE32rm,    0 },
+    { X86::CMOVAE64rr,      X86::CMOVAE64rm,    0 },
+    { X86::CMOVB16rr,       X86::CMOVB16rm,     0 },
+    { X86::CMOVB32rr,       X86::CMOVB32rm,     0 },
+    { X86::CMOVB64rr,       X86::CMOVB64rm,     0 },
+    { X86::CMOVBE16rr,      X86::CMOVBE16rm,    0 },
+    { X86::CMOVBE32rr,      X86::CMOVBE32rm,    0 },
+    { X86::CMOVBE64rr,      X86::CMOVBE64rm,    0 },
+    { X86::CMOVE16rr,       X86::CMOVE16rm,     0 },
+    { X86::CMOVE32rr,       X86::CMOVE32rm,     0 },
+    { X86::CMOVE64rr,       X86::CMOVE64rm,     0 },
+    { X86::CMOVG16rr,       X86::CMOVG16rm,     0 },
+    { X86::CMOVG32rr,       X86::CMOVG32rm,     0 },
+    { X86::CMOVG64rr,       X86::CMOVG64rm,     0 },
+    { X86::CMOVGE16rr,      X86::CMOVGE16rm,    0 },
+    { X86::CMOVGE32rr,      X86::CMOVGE32rm,    0 },
+    { X86::CMOVGE64rr,      X86::CMOVGE64rm,    0 },
+    { X86::CMOVL16rr,       X86::CMOVL16rm,     0 },
+    { X86::CMOVL32rr,       X86::CMOVL32rm,     0 },
+    { X86::CMOVL64rr,       X86::CMOVL64rm,     0 },
+    { X86::CMOVLE16rr,      X86::CMOVLE16rm,    0 },
+    { X86::CMOVLE32rr,      X86::CMOVLE32rm,    0 },
+    { X86::CMOVLE64rr,      X86::CMOVLE64rm,    0 },
+    { X86::CMOVNE16rr,      X86::CMOVNE16rm,    0 },
+    { X86::CMOVNE32rr,      X86::CMOVNE32rm,    0 },
+    { X86::CMOVNE64rr,      X86::CMOVNE64rm,    0 },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm,    0 },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm,    0 },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm,    0 },
+    { X86::CMOVNP16rr,      X86::CMOVNP16rm,    0 },
+    { X86::CMOVNP32rr,      X86::CMOVNP32rm,    0 },
+    { X86::CMOVNP64rr,      X86::CMOVNP64rm,    0 },
+    { X86::CMOVNS16rr,      X86::CMOVNS16rm,    0 },
+    { X86::CMOVNS32rr,      X86::CMOVNS32rm,    0 },
+    { X86::CMOVNS64rr,      X86::CMOVNS64rm,    0 },
+    { X86::CMOVO16rr,       X86::CMOVO16rm,     0 },
+    { X86::CMOVO32rr,       X86::CMOVO32rm,     0 },
+    { X86::CMOVO64rr,       X86::CMOVO64rm,     0 },
+    { X86::CMOVP16rr,       X86::CMOVP16rm,     0 },
+    { X86::CMOVP32rr,       X86::CMOVP32rm,     0 },
+    { X86::CMOVP64rr,       X86::CMOVP64rm,     0 },
+    { X86::CMOVS16rr,       X86::CMOVS16rm,     0 },
+    { X86::CMOVS32rr,       X86::CMOVS32rm,     0 },
+    { X86::CMOVS64rr,       X86::CMOVS64rm,     0 },
+    { X86::CMPPDrri,        X86::CMPPDrmi,      TB_ALIGN_16 },
+    { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
+    { X86::CMPSDrr,         X86::CMPSDrm,       0 },
+    { X86::CMPSSrr,         X86::CMPSSrm,       0 },
+    { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
+    { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
+    { X86::DIVSDrr,         X86::DIVSDrm,       0 },
+    { X86::DIVSSrr,         X86::DIVSSrm,       0 },
+    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
+    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
+    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
+    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
+    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
+    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
+    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
+    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+    { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
+    { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
+    { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
+    { X86::HSUBPSrr,        X86::HSUBPSrm,      TB_ALIGN_16 },
+    { X86::IMUL16rr,        X86::IMUL16rm,      0 },
+    { X86::IMUL32rr,        X86::IMUL32rm,      0 },
+    { X86::IMUL64rr,        X86::IMUL64rm,      0 },
+    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   0 },
+    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   0 },
+    { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
+    { X86::MAXPDrr_Int,     X86::MAXPDrm_Int,   TB_ALIGN_16 },
+    { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
+    { X86::MAXPSrr_Int,     X86::MAXPSrm_Int,   TB_ALIGN_16 },
+    { X86::MAXSDrr,         X86::MAXSDrm,       0 },
+    { X86::MAXSDrr_Int,     X86::MAXSDrm_Int,   0 },
+    { X86::MAXSSrr,         X86::MAXSSrm,       0 },
+    { X86::MAXSSrr_Int,     X86::MAXSSrm_Int,   0 },
+    { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
+    { X86::MINPDrr_Int,     X86::MINPDrm_Int,   TB_ALIGN_16 },
+    { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
+    { X86::MINPSrr_Int,     X86::MINPSrm_Int,   TB_ALIGN_16 },
+    { X86::MINSDrr,         X86::MINSDrm,       0 },
+    { X86::MINSDrr_Int,     X86::MINSDrm_Int,   0 },
+    { X86::MINSSrr,         X86::MINSSrm,       0 },
+    { X86::MINSSrr_Int,     X86::MINSSrm_Int,   0 },
+    { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
+    { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
+    { X86::MULSDrr,         X86::MULSDrm,       0 },
+    { X86::MULSSrr,         X86::MULSSrm,       0 },
+    { X86::OR16rr,          X86::OR16rm,        0 },
+    { X86::OR32rr,          X86::OR32rm,        0 },
+    { X86::OR64rr,          X86::OR64rm,        0 },
+    { X86::OR8rr,           X86::OR8rm,         0 },
+    { X86::ORPDrr,          X86::ORPDrm,        TB_ALIGN_16 },
+    { X86::ORPSrr,          X86::ORPSrm,        TB_ALIGN_16 },
+    { X86::PACKSSDWrr,      X86::PACKSSDWrm,    TB_ALIGN_16 },
+    { X86::PACKSSWBrr,      X86::PACKSSWBrm,    TB_ALIGN_16 },
+    { X86::PACKUSWBrr,      X86::PACKUSWBrm,    TB_ALIGN_16 },
+    { X86::PADDBrr,         X86::PADDBrm,       TB_ALIGN_16 },
+    { X86::PADDDrr,         X86::PADDDrm,       TB_ALIGN_16 },
+    { X86::PADDQrr,         X86::PADDQrm,       TB_ALIGN_16 },
+    { X86::PADDSBrr,        X86::PADDSBrm,      TB_ALIGN_16 },
+    { X86::PADDSWrr,        X86::PADDSWrm,      TB_ALIGN_16 },
+    { X86::PADDWrr,         X86::PADDWrm,       TB_ALIGN_16 },
+    { X86::PANDNrr,         X86::PANDNrm,       TB_ALIGN_16 },
+    { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
+    { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
+    { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
+    { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
+    { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
+    { X86::PCMPEQWrr,       X86::PCMPEQWrm,     TB_ALIGN_16 },
+    { X86::PCMPGTBrr,       X86::PCMPGTBrm,     TB_ALIGN_16 },
+    { X86::PCMPGTDrr,       X86::PCMPGTDrm,     TB_ALIGN_16 },
+    { X86::PCMPGTWrr,       X86::PCMPGTWrm,     TB_ALIGN_16 },
+    { X86::PINSRWrri,       X86::PINSRWrmi,     TB_ALIGN_16 },
+    { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
+    { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
+    { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
+    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
+    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
+    { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
+    { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
+    { X86::PMULHWrr,        X86::PMULHWrm,      TB_ALIGN_16 },
+    { X86::PMULLDrr,        X86::PMULLDrm,      TB_ALIGN_16 },
+    { X86::PMULLWrr,        X86::PMULLWrm,      TB_ALIGN_16 },
+    { X86::PMULUDQrr,       X86::PMULUDQrm,     TB_ALIGN_16 },
+    { X86::PORrr,           X86::PORrm,         TB_ALIGN_16 },
+    { X86::PSADBWrr,        X86::PSADBWrm,      TB_ALIGN_16 },
+    { X86::PSLLDrr,         X86::PSLLDrm,       TB_ALIGN_16 },
+    { X86::PSLLQrr,         X86::PSLLQrm,       TB_ALIGN_16 },
+    { X86::PSLLWrr,         X86::PSLLWrm,       TB_ALIGN_16 },
+    { X86::PSRADrr,         X86::PSRADrm,       TB_ALIGN_16 },
+    { X86::PSRAWrr,         X86::PSRAWrm,       TB_ALIGN_16 },
+    { X86::PSRLDrr,         X86::PSRLDrm,       TB_ALIGN_16 },
+    { X86::PSRLQrr,         X86::PSRLQrm,       TB_ALIGN_16 },
+    { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
+    { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
+    { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
+    { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
+    { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
+    { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
+    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
+    { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
+    { X86::SBB32rr,         X86::SBB32rm,       0 },
+    { X86::SBB64rr,         X86::SBB64rm,       0 },
+    { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
+    { X86::SHUFPSrri,       X86::SHUFPSrmi,     TB_ALIGN_16 },
+    { X86::SUB16rr,         X86::SUB16rm,       0 },
+    { X86::SUB32rr,         X86::SUB32rm,       0 },
+    { X86::SUB64rr,         X86::SUB64rm,       0 },
+    { X86::SUB8rr,          X86::SUB8rm,        0 },
+    { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
+    { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
+    { X86::SUBSDrr,         X86::SUBSDrm,       0 },
+    { X86::SUBSSrr,         X86::SUBSSrm,       0 },
     // FIXME: TEST*rr -> swapped operand of TEST*mr.
-    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm, 16 },
-    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm, 16 },
-    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm, 16 },
-    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm, 16 },
-    { X86::XOR16rr,         X86::XOR16rm, 0 },
-    { X86::XOR32rr,         X86::XOR32rm, 0 },
-    { X86::XOR64rr,         X86::XOR64rm, 0 },
-    { X86::XOR8rr,          X86::XOR8rm, 0 },
-    { X86::XORPDrr,         X86::XORPDrm, 16 },
-    { X86::XORPSrr,         X86::XORPSrm, 16 }
+    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm,    TB_ALIGN_16 },
+    { X86::XOR16rr,         X86::XOR16rm,       0 },
+    { X86::XOR32rr,         X86::XOR32rm,       0 },
+    { X86::XOR64rr,         X86::XOR64rm,       0 },
+    { X86::XOR8rr,          X86::XOR8rm,        0 },
+    { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
+    { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
+    { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
+    { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
+    { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
+    { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
+    { X86::Int_VCVTSI2SDrr,   X86::Int_VCVTSI2SDrm,    0 },
+    { X86::VCVTSI2SS64rr,     X86::VCVTSI2SS64rm,      0 },
+    { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
+    { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
+    { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
+    { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
+    { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
+    { X86::VCVTTSD2SI64rr,    X86::VCVTTSD2SI64rm,     0 },
+    { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 },
+    { X86::VCVTTSD2SIrr,      X86::VCVTTSD2SIrm,       0 },
+    { X86::Int_VCVTTSD2SIrr,  X86::Int_VCVTTSD2SIrm,   0 },
+    { X86::VCVTTSS2SI64rr,    X86::VCVTTSS2SI64rm,     0 },
+    { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 },
+    { X86::VCVTTSS2SIrr,      X86::VCVTTSS2SIrm,       0 },
+    { X86::Int_VCVTTSS2SIrr,  X86::Int_VCVTTSS2SIrm,   0 },
+    { X86::VCVTSD2SI64rr,     X86::VCVTSD2SI64rm,      0 },
+    { X86::VCVTSD2SIrr,       X86::VCVTSD2SIrm,        0 },
+    { X86::VCVTTPD2DQrr,      X86::VCVTTPD2DQrm,       TB_ALIGN_16 },
+    { X86::VCVTTPS2DQrr,      X86::VCVTTPS2DQrm,       TB_ALIGN_16 },
+    { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
+    { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
+    { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
+    { X86::VADDPDrr,          X86::VADDPDrm,           TB_ALIGN_16 },
+    { X86::VADDPSrr,          X86::VADDPSrm,           TB_ALIGN_16 },
+    { X86::VADDSDrr,          X86::VADDSDrm,           0 },
+    { X86::VADDSSrr,          X86::VADDSSrm,           0 },
+    { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        TB_ALIGN_16 },
+    { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        TB_ALIGN_16 },
+    { X86::VANDNPDrr,         X86::VANDNPDrm,          TB_ALIGN_16 },
+    { X86::VANDNPSrr,         X86::VANDNPSrm,          TB_ALIGN_16 },
+    { X86::VANDPDrr,          X86::VANDPDrm,           TB_ALIGN_16 },
+    { X86::VANDPSrr,          X86::VANDPSrm,           TB_ALIGN_16 },
+    { X86::VCMPPDrri,         X86::VCMPPDrmi,          TB_ALIGN_16 },
+    { X86::VCMPPSrri,         X86::VCMPPSrmi,          TB_ALIGN_16 },
+    { X86::VCMPSDrr,          X86::VCMPSDrm,           0 },
+    { X86::VCMPSSrr,          X86::VCMPSSrm,           0 },
+    { X86::VDIVPDrr,          X86::VDIVPDrm,           TB_ALIGN_16 },
+    { X86::VDIVPSrr,          X86::VDIVPSrm,           TB_ALIGN_16 },
+    { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
+    { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
+    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
+    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
+    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
+    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
+    { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
+    { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
+    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
+    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
+    { X86::VHADDPDrr,         X86::VHADDPDrm,          TB_ALIGN_16 },
+    { X86::VHADDPSrr,         X86::VHADDPSrm,          TB_ALIGN_16 },
+    { X86::VHSUBPDrr,         X86::VHSUBPDrm,          TB_ALIGN_16 },
+    { X86::VHSUBPSrr,         X86::VHSUBPSrm,          TB_ALIGN_16 },
+    { X86::Int_VCMPSDrr,      X86::Int_VCMPSDrm,       0 },
+    { X86::Int_VCMPSSrr,      X86::Int_VCMPSSrm,       0 },
+    { X86::VMAXPDrr,          X86::VMAXPDrm,           TB_ALIGN_16 },
+    { X86::VMAXPDrr_Int,      X86::VMAXPDrm_Int,       TB_ALIGN_16 },
+    { X86::VMAXPSrr,          X86::VMAXPSrm,           TB_ALIGN_16 },
+    { X86::VMAXPSrr_Int,      X86::VMAXPSrm_Int,       TB_ALIGN_16 },
+    { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
+    { X86::VMAXSDrr_Int,      X86::VMAXSDrm_Int,       0 },
+    { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
+    { X86::VMAXSSrr_Int,      X86::VMAXSSrm_Int,       0 },
+    { X86::VMINPDrr,          X86::VMINPDrm,           TB_ALIGN_16 },
+    { X86::VMINPDrr_Int,      X86::VMINPDrm_Int,       TB_ALIGN_16 },
+    { X86::VMINPSrr,          X86::VMINPSrm,           TB_ALIGN_16 },
+    { X86::VMINPSrr_Int,      X86::VMINPSrm_Int,       TB_ALIGN_16 },
+    { X86::VMINSDrr,          X86::VMINSDrm,           0 },
+    { X86::VMINSDrr_Int,      X86::VMINSDrm_Int,       0 },
+    { X86::VMINSSrr,          X86::VMINSSrm,           0 },
+    { X86::VMINSSrr_Int,      X86::VMINSSrm_Int,       0 },
+    { X86::VMULPDrr,          X86::VMULPDrm,           TB_ALIGN_16 },
+    { X86::VMULPSrr,          X86::VMULPSrm,           TB_ALIGN_16 },
+    { X86::VMULSDrr,          X86::VMULSDrm,           0 },
+    { X86::VMULSSrr,          X86::VMULSSrm,           0 },
+    { X86::VORPDrr,           X86::VORPDrm,            TB_ALIGN_16 },
+    { X86::VORPSrr,           X86::VORPSrm,            TB_ALIGN_16 },
+    { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        TB_ALIGN_16 },
+    { X86::VPACKSSWBrr,       X86::VPACKSSWBrm,        TB_ALIGN_16 },
+    { X86::VPACKUSWBrr,       X86::VPACKUSWBrm,        TB_ALIGN_16 },
+    { X86::VPADDBrr,          X86::VPADDBrm,           TB_ALIGN_16 },
+    { X86::VPADDDrr,          X86::VPADDDrm,           TB_ALIGN_16 },
+    { X86::VPADDQrr,          X86::VPADDQrm,           TB_ALIGN_16 },
+    { X86::VPADDSBrr,         X86::VPADDSBrm,          TB_ALIGN_16 },
+    { X86::VPADDSWrr,         X86::VPADDSWrm,          TB_ALIGN_16 },
+    { X86::VPADDWrr,          X86::VPADDWrm,           TB_ALIGN_16 },
+    { X86::VPANDNrr,          X86::VPANDNrm,           TB_ALIGN_16 },
+    { X86::VPANDrr,           X86::VPANDrm,            TB_ALIGN_16 },
+    { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         TB_ALIGN_16 },
+    { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         TB_ALIGN_16 },
+    { X86::VPCMPEQWrr,        X86::VPCMPEQWrm,         TB_ALIGN_16 },
+    { X86::VPCMPGTBrr,        X86::VPCMPGTBrm,         TB_ALIGN_16 },
+    { X86::VPCMPGTDrr,        X86::VPCMPGTDrm,         TB_ALIGN_16 },
+    { X86::VPCMPGTWrr,        X86::VPCMPGTWrm,         TB_ALIGN_16 },
+    { X86::VPINSRWrri,        X86::VPINSRWrmi,         TB_ALIGN_16 },
+    { X86::VPMADDWDrr,        X86::VPMADDWDrm,         TB_ALIGN_16 },
+    { X86::VPMAXSWrr,         X86::VPMAXSWrm,          TB_ALIGN_16 },
+    { X86::VPMAXUBrr,         X86::VPMAXUBrm,          TB_ALIGN_16 },
+    { X86::VPMINSWrr,         X86::VPMINSWrm,          TB_ALIGN_16 },
+    { X86::VPMINUBrr,         X86::VPMINUBrm,          TB_ALIGN_16 },
+    { X86::VPMULDQrr,         X86::VPMULDQrm,          TB_ALIGN_16 },
+    { X86::VPMULHUWrr,        X86::VPMULHUWrm,         TB_ALIGN_16 },
+    { X86::VPMULHWrr,         X86::VPMULHWrm,          TB_ALIGN_16 },
+    { X86::VPMULLDrr,         X86::VPMULLDrm,          TB_ALIGN_16 },
+    { X86::VPMULLWrr,         X86::VPMULLWrm,          TB_ALIGN_16 },
+    { X86::VPMULUDQrr,        X86::VPMULUDQrm,         TB_ALIGN_16 },
+    { X86::VPORrr,            X86::VPORrm,             TB_ALIGN_16 },
+    { X86::VPSADBWrr,         X86::VPSADBWrm,          TB_ALIGN_16 },
+    { X86::VPSLLDrr,          X86::VPSLLDrm,           TB_ALIGN_16 },
+    { X86::VPSLLQrr,          X86::VPSLLQrm,           TB_ALIGN_16 },
+    { X86::VPSLLWrr,          X86::VPSLLWrm,           TB_ALIGN_16 },
+    { X86::VPSRADrr,          X86::VPSRADrm,           TB_ALIGN_16 },
+    { X86::VPSRAWrr,          X86::VPSRAWrm,           TB_ALIGN_16 },
+    { X86::VPSRLDrr,          X86::VPSRLDrm,           TB_ALIGN_16 },
+    { X86::VPSRLQrr,          X86::VPSRLQrm,           TB_ALIGN_16 },
+    { X86::VPSRLWrr,          X86::VPSRLWrm,           TB_ALIGN_16 },
+    { X86::VPSUBBrr,          X86::VPSUBBrm,           TB_ALIGN_16 },
+    { X86::VPSUBDrr,          X86::VPSUBDrm,           TB_ALIGN_16 },
+    { X86::VPSUBSBrr,         X86::VPSUBSBrm,          TB_ALIGN_16 },
+    { X86::VPSUBSWrr,         X86::VPSUBSWrm,          TB_ALIGN_16 },
+    { X86::VPSUBWrr,          X86::VPSUBWrm,           TB_ALIGN_16 },
+    { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       TB_ALIGN_16 },
+    { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       TB_ALIGN_16 },
+    { X86::VPUNPCKHQDQrr,     X86::VPUNPCKHQDQrm,      TB_ALIGN_16 },
+    { X86::VPUNPCKHWDrr,      X86::VPUNPCKHWDrm,       TB_ALIGN_16 },
+    { X86::VPUNPCKLBWrr,      X86::VPUNPCKLBWrm,       TB_ALIGN_16 },
+    { X86::VPUNPCKLDQrr,      X86::VPUNPCKLDQrm,       TB_ALIGN_16 },
+    { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      TB_ALIGN_16 },
+    { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       TB_ALIGN_16 },
+    { X86::VPXORrr,           X86::VPXORrm,            TB_ALIGN_16 },
+    { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         TB_ALIGN_16 },
+    { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         TB_ALIGN_16 },
+    { X86::VSUBPDrr,          X86::VSUBPDrm,           TB_ALIGN_16 },
+    { X86::VSUBPSrr,          X86::VSUBPSrm,           TB_ALIGN_16 },
+    { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
+    { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
+    { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        TB_ALIGN_16 },
+    { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        TB_ALIGN_16 },
+    { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        TB_ALIGN_16 },
+    { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        TB_ALIGN_16 },
+    { X86::VXORPDrr,          X86::VXORPDrm,           TB_ALIGN_16 },
+    { X86::VXORPSrr,          X86::VXORPSrm,           TB_ALIGN_16 }
+    // FIXME: add AVX 256-bit foldable instructions
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
     unsigned RegOp = OpTbl2[i][0];
-    unsigned MemOp = OpTbl2[i][1] & ~TB_FLAGS;
-    unsigned Align = OpTbl2[i][2];
-
-    assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!");
-    RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align);
-
-    // If this is not a reversible operation (because there is a many->one)
-    // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
-    if (OpTbl2[i][1] & TB_NOT_REVERSABLE)
-      continue;
+    unsigned MemOp = OpTbl2[i][1];
+    unsigned Flags = OpTbl2[i][2];
+    AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 2, folded load
+                  Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+  }
+}
 
-    // Index 2, folded load
-    unsigned AuxInfo = 2 | (1 << 4);
-    assert(!MemOp2RegOpTable.count(MemOp) &&
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags) {
+    if ((Flags & TB_NO_FORWARD) == 0) {
+      assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+      R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+    }
+    if ((Flags & TB_NO_REVERSE) == 0) {
+      assert(!M2RTable.count(MemOp) &&
            "Duplicated entries in unfolding maps?");
-    MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
-  }
+      M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+    }
 }
 
 bool
@@ -796,6 +1000,11 @@ static bool isFrameLoadOpcode(int Opcode) {
   case X86::MOVAPSrm:
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
   case X86::VMOVAPSYrm:
   case X86::VMOVAPDYrm:
   case X86::VMOVDQAYrm:
@@ -820,6 +1029,11 @@ static bool isFrameStoreOpcode(int Opcode) {
   case X86::MOVAPSmr:
   case X86::MOVAPDmr:
   case X86::MOVDQAmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVDQAmr:
   case X86::VMOVAPSYmr:
   case X86::VMOVAPDYmr:
   case X86::VMOVDQAYmr:
@@ -852,24 +1066,6 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
   return 0;
 }
 
-bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
-                                        const MachineMemOperand *&MMO,
-                                        int &FrameIndex) const {
-  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
-         oe = MI->memoperands_end();
-       o != oe;
-       ++o) {
-    if ((*o)->isLoad() && (*o)->getValue())
-      if (const FixedStackPseudoSourceValue *Value =
-          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
-        FrameIndex = Value->getFrameIndex();
-        MMO = *o;
-        return true;
-      }
-  }
-  return false;
-}
-
 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                           int &FrameIndex) const {
   if (isFrameStoreOpcode(MI->getOpcode()))
@@ -892,24 +1088,6 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
   return 0;
 }
 
-bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
-                                       const MachineMemOperand *&MMO,
-                                       int &FrameIndex) const {
-  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
-         oe = MI->memoperands_end();
-       o != oe;
-       ++o) {
-    if ((*o)->isStore() && (*o)->getValue())
-      if (const FixedStackPseudoSourceValue *Value =
-          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
-        FrameIndex = Value->getFrameIndex();
-        MMO = *o;
-        return true;
-      }
-  }
-  return false;
-}
-
 /// regIsPICBase - Return true if register is PIC base (i.e.g defined by
 /// X86::MOVPC32r.
 static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
@@ -941,12 +1119,20 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
     case X86::MOVUPSrm:
     case X86::MOVAPDrm:
     case X86::MOVDQArm:
+    case X86::VMOVSSrm:
+    case X86::VMOVSDrm:
+    case X86::VMOVAPSrm:
+    case X86::VMOVUPSrm:
+    case X86::VMOVAPDrm:
+    case X86::VMOVDQArm:
     case X86::VMOVAPSYrm:
     case X86::VMOVUPSYrm:
     case X86::VMOVAPDYrm:
     case X86::VMOVDQAYrm:
     case X86::MMX_MOVD64rm:
     case X86::MMX_MOVQ64rm:
+    case X86::FsVMOVAPSrm:
+    case X86::FsVMOVAPDrm:
     case X86::FsMOVAPSrm:
     case X86::FsMOVAPDrm: {
       // Loads from constant pools are trivially rematerializable.
@@ -1009,15 +1195,11 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I) {
   MachineBasicBlock::iterator E = MBB.end();
 
-  // It's always safe to clobber EFLAGS at the end of a block.
-  if (I == E)
-    return true;
-
   // For compile time consideration, if we are not able to determine the
   // safety after visiting 4 instructions in each direction, we will assume
   // it's not safe.
   MachineBasicBlock::iterator Iter = I;
-  for (unsigned i = 0; i < 4; ++i) {
+  for (unsigned i = 0; Iter != E && i < 4; ++i) {
     bool SeenDef = false;
     for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
       MachineOperand &MO = Iter->getOperand(j);
@@ -1037,10 +1219,16 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
     // Skip over DBG_VALUE.
     while (Iter != E && Iter->isDebugValue())
       ++Iter;
+  }
 
-    // If we make it to the end of the block, it's safe to clobber EFLAGS.
-    if (Iter == E)
-      return true;
+  // It is safe to clobber EFLAGS at the end of a block of no successor has it
+  // live in.
+  if (Iter == E) {
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+           SE = MBB.succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+    return true;
   }
 
   MachineBasicBlock::iterator B = MBB.begin();
@@ -1946,7 +2134,8 @@ static bool isHReg(unsigned Reg) {
 }
 
 // Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+                                        bool HasAVX) {
   // SrcReg(VR128) -> DestReg(GR64)
   // SrcReg(VR64)  -> DestReg(GR64)
   // SrcReg(GR64)  -> DestReg(VR128)
@@ -1955,7 +2144,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
   if (X86::GR64RegClass.contains(DestReg)) {
     if (X86::VR128RegClass.contains(SrcReg)) {
       // Copy from a VR128 register to a GR64 register.
-      return X86::MOVPQIto64rr;
+      return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
     } else if (X86::VR64RegClass.contains(SrcReg)) {
       // Copy from a VR64 register to a GR64 register.
       return X86::MOVSDto64rr;
@@ -1963,12 +2152,23 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
     if (X86::VR128RegClass.contains(DestReg))
-      return X86::MOV64toPQIrr;
+      return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
     // Copy from a GR64 register to a VR64 register.
     else if (X86::VR64RegClass.contains(DestReg))
       return X86::MOV64toSDrr;
   }
 
+  // SrcReg(FR32) -> DestReg(GR32)
+  // SrcReg(GR32) -> DestReg(FR32)
+
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+      // Copy from a FR32 register to a GR32 register.
+      return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+
+  if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+      // Copy from a GR32 register to a FR32 register.
+      return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+
   return 0;
 }
 
@@ -1977,6 +2177,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                unsigned DestReg, unsigned SrcReg,
                                bool KillSrc) const {
   // First deal with the normal symmetric copies.
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   unsigned Opc = 0;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
@@ -1988,18 +2189,21 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     // Copying to or from a physical H register on x86-64 requires a NOREX
     // move.  Otherwise use a normal move.
     if ((isHReg(DestReg) || isHReg(SrcReg)) &&
-        TM.getSubtarget<X86Subtarget>().is64Bit())
+        TM.getSubtarget<X86Subtarget>().is64Bit()) {
       Opc = X86::MOV8rr_NOREX;
-    else
+      // Both operands must be encodable without an REX prefix.
+      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+             "8-bit H register can not be copied outside GR8_NOREX");
+    } else
       Opc = X86::MOV8rr;
   } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
-    Opc = X86::MOVAPSrr;
+    Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
   else if (X86::VR256RegClass.contains(DestReg, SrcReg))
     Opc = X86::VMOVAPSYrr;
   else if (X86::VR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MMX_MOVQ64rr;
   else
-    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg);
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
 
   if (Opc) {
     BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -2043,6 +2247,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool isStackAligned,
                                       const TargetMachine &TM,
                                       bool load) {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   switch (RC->getSize()) {
   default:
     llvm_unreachable("Unknown spill size");
@@ -2061,7 +2266,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
     if (X86::GR32RegClass.hasSubClassEq(RC))
       return load ? X86::MOV32rm : X86::MOV32mr;
     if (X86::FR32RegClass.hasSubClassEq(RC))
-      return load ? X86::MOVSSrm : X86::MOVSSmr;
+      return load ?
+        (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+        (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
     if (X86::RFP32RegClass.hasSubClassEq(RC))
       return load ? X86::LD_Fp32m : X86::ST_Fp32m;
     llvm_unreachable("Unknown 4-byte regclass");
@@ -2069,7 +2276,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
     if (X86::GR64RegClass.hasSubClassEq(RC))
       return load ? X86::MOV64rm : X86::MOV64mr;
     if (X86::FR64RegClass.hasSubClassEq(RC))
-      return load ? X86::MOVSDrm : X86::MOVSDmr;
+      return load ?
+        (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+        (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
     if (X86::VR64RegClass.hasSubClassEq(RC))
       return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
     if (X86::RFP64RegClass.hasSubClassEq(RC))
@@ -2078,13 +2287,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
   case 10:
     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
-  case 16:
+  case 16: {
     assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
-      return load ? X86::MOVAPSrm : X86::MOVAPSmr;
+      return load ?
+        (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+        (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
     else
-      return load ? X86::MOVUPSrm : X86::MOVUPSmr;
+      return load ?
+        (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+        (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+  }
   case 32:
     assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
@@ -2118,7 +2332,8 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   const MachineFunction &MF = *MBB.getParent();
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
-  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= 16) ||
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2133,7 +2348,9 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16;
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -2151,7 +2368,8 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         const TargetRegisterClass *RC,
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
-  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= 16) ||
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2164,7 +2382,9 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16;
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@@ -2174,6 +2394,40 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
   NewMIs.push_back(MIB);
 }
 
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.  This is
+/// used for mapping:
+///   %xmm4 = V_SET0
+/// to:
+///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  unsigned Reg = MI->getOperand(0).getReg();
+  MI->setDesc(Desc);
+
+  // MachineInstr::addOperand() will insert explicit operands before any
+  // implicit operands.
+  MachineInstrBuilder(MI).addReg(Reg, RegState::Undef)
+                         .addReg(Reg, RegState::Undef);
+  // But we don't trust that.
+  assert(MI->getOperand(1).getReg() == Reg &&
+         MI->getOperand(2).getReg() == Reg && "Misplaced operand");
+  return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  switch (MI->getOpcode()) {
+  case X86::V_SET0:
+    return Expand2AddrUndef(MI, get(HasAVX ? X86::VPXORrr : X86::PXORrr));
+  case X86::TEST8ri_NOREX:
+    MI->setDesc(get(X86::TEST8ri));
+    return true;
+  }
+  return false;
+}
+
 MachineInstr*
 X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
                                        int FrameIx, uint64_t Offset,
@@ -2305,7 +2559,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       OpcodeTablePtr->find(MI->getOpcode());
     if (I != OpcodeTablePtr->end()) {
       unsigned Opcode = I->second.first;
-      unsigned MinAlign = I->second.second;
+      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
       if (Align < MinAlign)
         return NULL;
       bool NarrowToMOV32rm = false;
@@ -2352,6 +2606,51 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   return NULL;
 }
 
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+///   movss (%rdi), %xmm0
+///   cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+///   cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::CVTSD2SSrr:
+  case X86::Int_CVTSD2SSrr:
+  case X86::CVTSS2SDrr:
+  case X86::Int_CVTSS2SDrr:
+  case X86::RCPSSr:
+  case X86::RCPSSr_Int:
+  case X86::ROUNDSDr:
+  case X86::ROUNDSSr:
+  case X86::RSQRTSSr:
+  case X86::RSQRTSSr_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+  // AVX encoded versions
+  case X86::VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::VRCPSSr:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSSr:
+  case X86::VRSQRTSSr:
+  case X86::VSQRTSSr:
+    return true;
+  }
+
+  return false;
+}
 
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
@@ -2360,22 +2659,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // Check switch flag
   if (NoFusing) return NULL;
 
-  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
-    switch (MI->getOpcode()) {
-    case X86::CVTSD2SSrr:
-    case X86::Int_CVTSD2SSrr:
-    case X86::CVTSS2SDrr:
-    case X86::Int_CVTSS2SDrr:
-    case X86::RCPSSr:
-    case X86::RCPSSr_Int:
-    case X86::ROUNDSDr:
-    case X86::ROUNDSSr:
-    case X86::RSQRTSSr:
-    case X86::RSQRTSSr_Int:
-    case X86::SQRTSSr:
-    case X86::SQRTSSr_Int:
-      return 0;
-    }
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
@@ -2412,22 +2700,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // Check switch flag
   if (NoFusing) return NULL;
 
-  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
-    switch (MI->getOpcode()) {
-    case X86::CVTSD2SSrr:
-    case X86::Int_CVTSD2SSrr:
-    case X86::CVTSS2SDrr:
-    case X86::Int_CVTSS2SDrr:
-    case X86::RCPSSr:
-    case X86::RCPSSr_Int:
-    case X86::ROUNDSDr:
-    case X86::ROUNDSSr:
-    case X86::RSQRTSSr:
-    case X86::RSQRTSSr_Int:
-    case X86::SQRTSSr:
-    case X86::SQRTSSr_Int:
-      return 0;
-    }
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
 
   // Determine the alignment of the load.
   unsigned Alignment = 0;
@@ -2439,13 +2716,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     case X86::AVX_SET0PDY:
       Alignment = 32;
       break;
-    case X86::V_SET0PS:
-    case X86::V_SET0PD:
-    case X86::V_SET0PI:
+    case X86::V_SET0:
     case X86::V_SETALLONES:
-    case X86::AVX_SET0PS:
-    case X86::AVX_SET0PD:
-    case X86::AVX_SET0PI:
+    case X86::AVX_SETALLONES:
       Alignment = 16;
       break;
     case X86::FsFLD0SD:
@@ -2481,18 +2754,16 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 
   SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
   switch (LoadMI->getOpcode()) {
-  case X86::V_SET0PS:
-  case X86::V_SET0PD:
-  case X86::V_SET0PI:
+  case X86::V_SET0:
   case X86::V_SETALLONES:
-  case X86::AVX_SET0PS:
-  case X86::AVX_SET0PD:
-  case X86::AVX_SET0PI:
   case X86::AVX_SET0PSY:
   case X86::AVX_SET0PDY:
+  case X86::AVX_SETALLONES:
   case X86::FsFLD0SD:
-  case X86::FsFLD0SS: {
-    // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure.
+  case X86::FsFLD0SS:
+  case X86::VFsFLD0SD:
+  case X86::VFsFLD0SS: {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
     // Create a constant-pool entry and operands to load from it.
 
     // Medium and large mode can't fold loads this way.
@@ -2515,7 +2786,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 
     // Create a constant-pool entry.
     MachineConstantPool &MCP = *MF.getConstantPool();
-    const Type *Ty;
+    Type *Ty;
     unsigned Opc = LoadMI->getOpcode();
     if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS)
       Ty = Type::getFloatTy(MF.getFunction()->getContext());
@@ -2525,9 +2796,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
-                    Constant::getAllOnesValue(Ty) :
-                    Constant::getNullValue(Ty);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
 
     // Create operands to load from the constant pool entry.
@@ -2615,9 +2887,9 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   if (I == MemOp2RegOpTable.end())
     return false;
   unsigned Opc = I->second.first;
-  unsigned Index = I->second.second & 0xf;
-  bool FoldedLoad = I->second.second & (1 << 4);
-  bool FoldedStore = I->second.second & (1 << 5);
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   if (UnfoldLoad && !FoldedLoad)
     return false;
   UnfoldLoad &= FoldedLoad;
@@ -2743,9 +3015,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   if (I == MemOp2RegOpTable.end())
     return false;
   unsigned Opc = I->second.first;
-  unsigned Index = I->second.second & 0xf;
-  bool FoldedLoad = I->second.second & (1 << 4);
-  bool FoldedStore = I->second.second & (1 << 5);
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   const MCInstrDesc &MCID = get(Opc);
   const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
   unsigned NumDefs = MCID.NumDefs;
@@ -2780,7 +3052,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
         !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
       // Do not introduce a slow unaligned load.
       return false;
-    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
     Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
                               VT, MVT::Other, &AddrOps[0], AddrOps.size());
     NewNodes.push_back(Load);
@@ -2822,7 +3096,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
         !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
       // Do not introduce a slow unaligned store.
       return false;
-    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
     SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
                                                          isAligned, TM),
                                        dl, MVT::Other,
@@ -2843,14 +3119,14 @@ unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
     MemOp2RegOpTable.find(Opc);
   if (I == MemOp2RegOpTable.end())
     return 0;
-  bool FoldedLoad = I->second.second & (1 << 4);
-  bool FoldedStore = I->second.second & (1 << 5);
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   if (UnfoldLoad && !FoldedLoad)
     return 0;
   if (UnfoldStore && !FoldedStore)
     return 0;
   if (LoadRegIndex)
-    *LoadRegIndex = I->second.second & 0xf;
+    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
   return I->second.first;
 }
 
@@ -2881,6 +3157,16 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
   case X86::VMOVAPSYrm:
   case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
@@ -2908,6 +3194,16 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
   case X86::VMOVAPSYrm:
   case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
@@ -3007,31 +3303,6 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
            RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
 }
 
-
-/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or higher)
-/// register?  e.g. r8, xmm8, xmm13, etc.
-bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
-  switch (RegNo) {
-  default: break;
-  case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
-  case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
-  case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
-  case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
-  case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
-  case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
-  case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
-  case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
-  case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
-  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
-  case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
-  case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
-  case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
-  case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
-    return true;
-  }
-  return false;
-}
-
 /// getGlobalBaseReg - Return a virtual register initialized with the
 /// the global base register value. Output instructions required to
 /// initialize the register in the function entry block, if necessary.
@@ -3072,7 +3343,6 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
   { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
   { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
-  { X86::V_SET0PS,   X86::V_SET0PD,  X86::V_SET0PI  },
   { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
   { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
   // AVX 128-bit support
@@ -3088,7 +3358,6 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
   { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
   { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
-  { X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI },
   { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
   { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
   // AVX 256-bit support
@@ -3111,13 +3380,13 @@ static const unsigned *lookup(unsigned opcode, unsigned domain) {
 }
 
 std::pair<uint16_t, uint16_t>
-X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const {
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   return std::make_pair(domain,
                         domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
 }
 
-void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const {
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
@@ -3158,6 +3427,29 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
   case X86::SQRTSSm_Int:
   case X86::SQRTSSr:
   case X86::SQRTSSr_Int:
+  // AVX instructions with high latency
+  case X86::VDIVSDrm:
+  case X86::VDIVSDrm_Int:
+  case X86::VDIVSDrr:
+  case X86::VDIVSDrr_Int:
+  case X86::VDIVSSrm:
+  case X86::VDIVSSrm_Int:
+  case X86::VDIVSSrr:
+  case X86::VDIVSSrr_Int:
+  case X86::VSQRTPDm:
+  case X86::VSQRTPDm_Int:
+  case X86::VSQRTPDr:
+  case X86::VSQRTPDr_Int:
+  case X86::VSQRTPSm:
+  case X86::VSQRTPSm_Int:
+  case X86::VSQRTPSr:
+  case X86::VSQRTPSr_Int:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSSr:
     return true;
   }
 }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 5f2eba34ac45f..97009dbdbe501 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -27,24 +27,6 @@ namespace llvm {
   class X86TargetMachine;
 
 namespace X86 {
-  // Enums for memory operand decoding.  Each memory operand is represented with
-  // a 5 operand sequence in the form:
-  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
-  // These enums help decode this.
-  enum {
-    AddrBaseReg = 0,
-    AddrScaleAmt = 1,
-    AddrIndexReg = 2,
-    AddrDisp = 3,
-
-    /// AddrSegmentReg - The operand # of the segment in the memory operand.
-    AddrSegmentReg = 4,
-
-    /// AddrNumOperands - Total number of operands in a memory reference.
-    AddrNumOperands = 5
-  };
-
-
   // X86 specific condition code. These correspond to X86_*_COND in
   // X86InstrInfo.td. They must be kept in synch.
   enum CondCode {
@@ -82,133 +64,8 @@ namespace X86 {
   /// GetOppositeBranchCondition - Return the inverse of the specified cond,
   /// e.g. turning COND_E to COND_NE.
   CondCode GetOppositeBranchCondition(X86::CondCode CC);
+}  // end namespace X86;
 
-}
-
-/// X86II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace X86II {
-  /// Target Operand Flag enum.
-  enum TOF {
-    //===------------------------------------------------------------------===//
-    // X86 Specific MachineOperand flags.
-
-    MO_NO_FLAG,
-
-    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
-    /// relocation of:
-    ///    SYMBOL_LABEL + [. - PICBASELABEL]
-    MO_GOT_ABSOLUTE_ADDRESS,
-
-    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
-    /// immediate should get the value of the symbol minus the PIC base label:
-    ///    SYMBOL_LABEL - PICBASELABEL
-    MO_PIC_BASE_OFFSET,
-
-    /// MO_GOT - On a symbol operand this indicates that the immediate is the
-    /// offset to the GOT entry for the symbol name from the base of the GOT.
-    ///
-    /// See the X86-64 ELF ABI supplement for more details.
-    ///    SYMBOL_LABEL @GOT
-    MO_GOT,
-
-    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
-    /// the offset to the location of the symbol name from the base of the GOT.
-    ///
-    /// See the X86-64 ELF ABI supplement for more details.
-    ///    SYMBOL_LABEL @GOTOFF
-    MO_GOTOFF,
-
-    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
-    /// offset to the GOT entry for the symbol name from the current code
-    /// location.
-    ///
-    /// See the X86-64 ELF ABI supplement for more details.
-    ///    SYMBOL_LABEL @GOTPCREL
-    MO_GOTPCREL,
-
-    /// MO_PLT - On a symbol operand this indicates that the immediate is
-    /// offset to the PLT entry of symbol name from the current code location.
-    ///
-    /// See the X86-64 ELF ABI supplement for more details.
-    ///    SYMBOL_LABEL @PLT
-    MO_PLT,
-
-    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// See 'ELF Handling for Thread-Local Storage' for more details.
-    ///    SYMBOL_LABEL @TLSGD
-    MO_TLSGD,
-
-    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// See 'ELF Handling for Thread-Local Storage' for more details.
-    ///    SYMBOL_LABEL @GOTTPOFF
-    MO_GOTTPOFF,
-
-    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// See 'ELF Handling for Thread-Local Storage' for more details.
-    ///    SYMBOL_LABEL @INDNTPOFF
-    MO_INDNTPOFF,
-
-    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// See 'ELF Handling for Thread-Local Storage' for more details.
-    ///    SYMBOL_LABEL @TPOFF
-    MO_TPOFF,
-
-    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// See 'ELF Handling for Thread-Local Storage' for more details.
-    ///    SYMBOL_LABEL @NTPOFF
-    MO_NTPOFF,
-
-    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
-    /// reference is actually to the "__imp_FOO" symbol.  This is used for
-    /// dllimport linkage on windows.
-    MO_DLLIMPORT,
-
-    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
-    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
-    /// and jumps to external functions on Tiger and earlier.
-    MO_DARWIN_STUB,
-
-    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
-    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
-    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
-    MO_DARWIN_NONLAZY,
-
-    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
-    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
-    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
-    MO_DARWIN_NONLAZY_PIC_BASE,
-
-    /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
-    /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
-    /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
-    /// stub.
-    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
-
-    /// MO_TLVP - On a symbol operand this indicates that the immediate is
-    /// some TLS offset.
-    ///
-    /// This is the TLS offset for the Darwin TLS mechanism.
-    MO_TLVP,
-
-    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
-    /// is some TLS offset from the picbase.
-    ///
-    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
-    MO_TLVP_PIC_BASE
-  };
-}
 
 /// isGlobalStubReference - Return true if the specified TargetFlag operand is
 /// a reference to a stub for a global, not the global itself.
@@ -243,353 +100,6 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
   }
 }
 
-/// X86II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace X86II {
-  enum {
-    //===------------------------------------------------------------------===//
-    // Instruction encodings.  These are the standard/most common forms for X86
-    // instructions.
-    //
-
-    // PseudoFrm - This represents an instruction that is a pseudo instruction
-    // or one that has not been implemented yet.  It is illegal to code generate
-    // it, but tolerated for intermediate implementation stages.
-    Pseudo         = 0,
-
-    /// Raw - This form is for instructions that don't have any operands, so
-    /// they are just a fixed opcode value, like 'leave'.
-    RawFrm         = 1,
-
-    /// AddRegFrm - This form is used for instructions like 'push r32' that have
-    /// their one register operand added to their opcode.
-    AddRegFrm      = 2,
-
-    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
-    /// to specify a destination, which in this case is a register.
-    ///
-    MRMDestReg     = 3,
-
-    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
-    /// to specify a destination, which in this case is memory.
-    ///
-    MRMDestMem     = 4,
-
-    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
-    /// to specify a source, which in this case is a register.
-    ///
-    MRMSrcReg      = 5,
-
-    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
-    /// to specify a source, which in this case is memory.
-    ///
-    MRMSrcMem      = 6,
-
-    /// MRM[0-7][rm] - These forms are used to represent instructions that use
-    /// a Mod/RM byte, and use the middle field to hold extended opcode
-    /// information.  In the intel manual these are represented as /0, /1, ...
-    ///
-
-    // First, instructions that operate on a register r/m operand...
-    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
-    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
-
-    // Next, instructions that operate on a memory r/m operand...
-    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
-    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
-
-    // MRMInitReg - This form is used for instructions whose source and
-    // destinations are the same register.
-    MRMInitReg = 32,
-
-    //// MRM_C1 - A mod/rm byte of exactly 0xC1.
-    MRM_C1 = 33,
-    MRM_C2 = 34,
-    MRM_C3 = 35,
-    MRM_C4 = 36,
-    MRM_C8 = 37,
-    MRM_C9 = 38,
-    MRM_E8 = 39,
-    MRM_F0 = 40,
-    MRM_F8 = 41,
-    MRM_F9 = 42,
-    MRM_D0 = 45,
-    MRM_D1 = 46,
-
-    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
-    /// immediates, the first of which is a 16-bit immediate (specified by
-    /// the imm encoding) and the second is a 8-bit fixed value.
-    RawFrmImm8 = 43,
-
-    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
-    /// immediates, the first of which is a 16 or 32-bit immediate (specified by
-    /// the imm encoding) and the second is a 16-bit fixed value.  In the AMD
-    /// manual, this operand is described as pntr16:32 and pntr16:16
-    RawFrmImm16 = 44,
-
-    FormMask       = 63,
-
-    //===------------------------------------------------------------------===//
-    // Actual flags...
-
-    // OpSize - Set if this instruction requires an operand size prefix (0x66),
-    // which most often indicates that the instruction operates on 16 bit data
-    // instead of 32 bit data.
-    OpSize      = 1 << 6,
-
-    // AsSize - Set if this instruction requires an operand size prefix (0x67),
-    // which most often indicates that the instruction address 16 bit address
-    // instead of 32 bit address (or 32 bit address in 64 bit mode).
-    AdSize      = 1 << 7,
-
-    //===------------------------------------------------------------------===//
-    // Op0Mask - There are several prefix bytes that are used to form two byte
-    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
-    // used to obtain the setting of this field.  If no bits in this field is
-    // set, there is no prefix byte for obtaining a multibyte opcode.
-    //
-    Op0Shift    = 8,
-    Op0Mask     = 0x1F << Op0Shift,
-
-    // TB - TwoByte - Set if this instruction has a two byte opcode, which
-    // starts with a 0x0F byte before the real opcode.
-    TB          = 1 << Op0Shift,
-
-    // REP - The 0xF3 prefix byte indicating repetition of the following
-    // instruction.
-    REP         = 2 << Op0Shift,
-
-    // D8-DF - These escape opcodes are used by the floating point unit.  These
-    // values must remain sequential.
-    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
-    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
-    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
-    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
-
-    // XS, XD - These prefix codes are for single and double precision scalar
-    // floating point operations performed in the SSE registers.
-    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
-
-    // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
-    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
-    A6 = 15 << Op0Shift,  A7 = 16 << Op0Shift,
-
-    // TF - Prefix before and after 0x0F
-    TF = 17 << Op0Shift,
-
-    //===------------------------------------------------------------------===//
-    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
-    // They are used to specify GPRs and SSE registers, 64-bit operand size,
-    // etc. We only cares about REX.W and REX.R bits and only the former is
-    // statically determined.
-    //
-    REXShift    = Op0Shift + 5,
-    REX_W       = 1 << REXShift,
-
-    //===------------------------------------------------------------------===//
-    // This three-bit field describes the size of an immediate operand.  Zero is
-    // unused so that we can tell if we forgot to set a value.
-    ImmShift = REXShift + 1,
-    ImmMask    = 7 << ImmShift,
-    Imm8       = 1 << ImmShift,
-    Imm8PCRel  = 2 << ImmShift,
-    Imm16      = 3 << ImmShift,
-    Imm16PCRel = 4 << ImmShift,
-    Imm32      = 5 << ImmShift,
-    Imm32PCRel = 6 << ImmShift,
-    Imm64      = 7 << ImmShift,
-
-    //===------------------------------------------------------------------===//
-    // FP Instruction Classification...  Zero is non-fp instruction.
-
-    // FPTypeMask - Mask for all of the FP types...
-    FPTypeShift = ImmShift + 3,
-    FPTypeMask  = 7 << FPTypeShift,
-
-    // NotFP - The default, set for instructions that do not use FP registers.
-    NotFP      = 0 << FPTypeShift,
-
-    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
-    ZeroArgFP  = 1 << FPTypeShift,
-
-    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
-    OneArgFP   = 2 << FPTypeShift,
-
-    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
-    // result back to ST(0).  For example, fcos, fsqrt, etc.
-    //
-    OneArgFPRW = 3 << FPTypeShift,
-
-    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
-    // explicit argument, storing the result to either ST(0) or the implicit
-    // argument.  For example: fadd, fsub, fmul, etc...
-    TwoArgFP   = 4 << FPTypeShift,
-
-    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
-    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
-    CompareFP  = 5 << FPTypeShift,
-
-    // CondMovFP - "2 operand" floating point conditional move instructions.
-    CondMovFP  = 6 << FPTypeShift,
-
-    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
-    SpecialFP  = 7 << FPTypeShift,
-
-    // Lock prefix
-    LOCKShift = FPTypeShift + 3,
-    LOCK = 1 << LOCKShift,
-
-    // Segment override prefixes. Currently we just need ability to address
-    // stuff in gs and fs segments.
-    SegOvrShift = LOCKShift + 1,
-    SegOvrMask  = 3 << SegOvrShift,
-    FS          = 1 << SegOvrShift,
-    GS          = 2 << SegOvrShift,
-
-    // Execution domain for SSE instructions in bits 23, 24.
-    // 0 in bits 23-24 means normal, non-SSE instruction.
-    SSEDomainShift = SegOvrShift + 2,
-
-    OpcodeShift   = SSEDomainShift + 2,
-
-    //===------------------------------------------------------------------===//
-    /// VEX - The opcode prefix used by AVX instructions
-    VEXShift = OpcodeShift + 8,
-    VEX         = 1U << 0,
-
-    /// VEX_W - Has a opcode specific functionality, but is used in the same
-    /// way as REX_W is for regular SSE instructions.
-    VEX_W       = 1U << 1,
-
-    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
-    /// address instructions in SSE are represented as 3 address ones in AVX
-    /// and the additional register is encoded in VEX_VVVV prefix.
-    VEX_4V      = 1U << 2,
-
-    /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
-    /// must be encoded in the i8 immediate field. This usually happens in
-    /// instructions with 4 operands.
-    VEX_I8IMM   = 1U << 3,
-
-    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
-    /// instruction uses 256-bit wide registers. This is usually auto detected
-    /// if a VR256 register is used, but some AVX instructions also have this
-    /// field marked when using a f256 memory references.
-    VEX_L       = 1U << 4,
-
-    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
-    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
-    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
-    /// storing a classifier in the imm8 field.  To simplify our implementation,
-    /// we handle this by storeing the classifier in the opcode field and using
-    /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
-    Has3DNow0F0FOpcode = 1U << 5
-  };
-
-  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
-  // specified machine instruction.
-  //
-  static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
-    return TSFlags >> X86II::OpcodeShift;
-  }
-
-  static inline bool hasImm(uint64_t TSFlags) {
-    return (TSFlags & X86II::ImmMask) != 0;
-  }
-
-  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
-  /// of the specified instruction.
-  static inline unsigned getSizeOfImm(uint64_t TSFlags) {
-    switch (TSFlags & X86II::ImmMask) {
-    default: assert(0 && "Unknown immediate size");
-    case X86II::Imm8:
-    case X86II::Imm8PCRel:  return 1;
-    case X86II::Imm16:
-    case X86II::Imm16PCRel: return 2;
-    case X86II::Imm32:
-    case X86II::Imm32PCRel: return 4;
-    case X86II::Imm64:      return 8;
-    }
-  }
-
-  /// isImmPCRel - Return true if the immediate of the specified instruction's
-  /// TSFlags indicates that it is pc relative.
-  static inline unsigned isImmPCRel(uint64_t TSFlags) {
-    switch (TSFlags & X86II::ImmMask) {
-    default: assert(0 && "Unknown immediate size");
-    case X86II::Imm8PCRel:
-    case X86II::Imm16PCRel:
-    case X86II::Imm32PCRel:
-      return true;
-    case X86II::Imm8:
-    case X86II::Imm16:
-    case X86II::Imm32:
-    case X86II::Imm64:
-      return false;
-    }
-  }
-
-  /// getMemoryOperandNo - The function returns the MCInst operand # for the
-  /// first field of the memory operand.  If the instruction doesn't have a
-  /// memory operand, this returns -1.
-  ///
-  /// Note that this ignores tied operands.  If there is a tied register which
-  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
-  /// counted as one operand.
-  ///
-  static inline int getMemoryOperandNo(uint64_t TSFlags) {
-    switch (TSFlags & X86II::FormMask) {
-    case X86II::MRMInitReg:  assert(0 && "FIXME: Remove this form");
-    default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
-    case X86II::Pseudo:
-    case X86II::RawFrm:
-    case X86II::AddRegFrm:
-    case X86II::MRMDestReg:
-    case X86II::MRMSrcReg:
-    case X86II::RawFrmImm8:
-    case X86II::RawFrmImm16:
-       return -1;
-    case X86II::MRMDestMem:
-      return 0;
-    case X86II::MRMSrcMem: {
-      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-      unsigned FirstMemOp = 1;
-      if (HasVEX_4V)
-        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
-
-      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
-      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
-      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
-      return FirstMemOp;
-    }
-    case X86II::MRM0r: case X86II::MRM1r:
-    case X86II::MRM2r: case X86II::MRM3r:
-    case X86II::MRM4r: case X86II::MRM5r:
-    case X86II::MRM6r: case X86II::MRM7r:
-      return -1;
-    case X86II::MRM0m: case X86II::MRM1m:
-    case X86II::MRM2m: case X86II::MRM3m:
-    case X86II::MRM4m: case X86II::MRM5m:
-    case X86II::MRM6m: case X86II::MRM7m:
-      return 0;
-    case X86II::MRM_C1:
-    case X86II::MRM_C2:
-    case X86II::MRM_C3:
-    case X86II::MRM_C4:
-    case X86II::MRM_C8:
-    case X86II::MRM_C9:
-    case X86II::MRM_E8:
-    case X86II::MRM_F0:
-    case X86II::MRM_F8:
-    case X86II::MRM_F9:
-    case X86II::MRM_D0:
-    case X86II::MRM_D1:
-      return -1;
-    }
-  }
-}
-
 inline static bool isScale(const MachineOperand &MO) {
   return MO.isImm() &&
     (MO.getImm() == 1 || MO.getImm() == 2 ||
@@ -621,14 +131,22 @@ class X86InstrInfo : public X86GenInstrInfo {
   /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
   /// RegOp2MemOpTable2 - Load / store folding opcode maps.
   ///
-  DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2Addr;
-  DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable0;
-  DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable1;
-  DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2;
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+  RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+  RegOp2MemOpTableType RegOp2MemOpTable0;
+  RegOp2MemOpTableType RegOp2MemOpTable1;
+  RegOp2MemOpTableType RegOp2MemOpTable2;
 
   /// MemOp2RegOpTable - Load / store unfolding opcode map.
   ///
-  DenseMap<unsigned, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+  MemOp2RegOpTableType MemOp2RegOpTable;
+
+  void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                     MemOp2RegOpTableType &M2RTable,
+                     unsigned RegOp, unsigned MemOp, unsigned Flags);
 
 public:
   explicit X86InstrInfo(X86TargetMachine &tm);
@@ -656,17 +174,6 @@ public:
   unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
                                      int &FrameIndex) const;
 
-  /// hasLoadFromStackSlot - If the specified machine instruction has
-  /// a load from a stack slot, return true along with the FrameIndex
-  /// of the loaded stack slot and the machine mem operand containing
-  /// the reference.  If not, return false.  Unlike
-  /// isLoadFromStackSlot, this returns true for any instructions that
-  /// loads from the stack.  This is a hint only and may not catch all
-  /// cases.
-  bool hasLoadFromStackSlot(const MachineInstr *MI,
-                            const MachineMemOperand *&MMO,
-                            int &FrameIndex) const;
-
   unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
@@ -674,16 +181,6 @@ public:
   unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
                                     int &FrameIndex) const;
 
-  /// hasStoreToStackSlot - If the specified machine instruction has a
-  /// store to a stack slot, return true along with the FrameIndex of
-  /// the loaded stack slot and the machine mem operand containing the
-  /// reference.  If not, return false.  Unlike isStoreToStackSlot,
-  /// this returns true for any instructions that loads from the
-  /// stack.  This is a hint only and may not catch all cases.
-  bool hasStoreToStackSlot(const MachineInstr *MI,
-                           const MachineMemOperand *&MMO,
-                           int &FrameIndex) const;
-
   bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                          AliasAnalysis *AA) const;
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -750,6 +247,9 @@ public:
                                MachineInstr::mmo_iterator MMOBegin,
                                MachineInstr::mmo_iterator MMOEnd,
                                SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
   virtual
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
                                          int FrameIx, uint64_t Offset,
@@ -829,32 +329,21 @@ public:
   /// instruction that defines the specified register class.
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
 
-  static bool isX86_64NonExtLowByteReg(unsigned reg) {
-    return (reg == X86::SPL || reg == X86::BPL ||
-          reg == X86::SIL || reg == X86::DIL);
-  }
-
   static bool isX86_64ExtendedReg(const MachineOperand &MO) {
     if (!MO.isReg()) return false;
-    return isX86_64ExtendedReg(MO.getReg());
+    return X86II::isX86_64ExtendedReg(MO.getReg());
   }
 
-  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
-  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
-  static bool isX86_64ExtendedReg(unsigned RegNo);
-
   /// getGlobalBaseReg - Return a virtual register initialized with the
   /// the global base register value. Output instructions required to
   /// initialize the register in the function entry block, if necessary.
   ///
   unsigned getGlobalBaseReg(MachineFunction *MF) const;
 
-  /// GetSSEDomain - Return the SSE execution domain of MI as the first element,
-  /// and a bitmask of possible arguments to SetSSEDomain ase the second.
-  std::pair<uint16_t, uint16_t> GetSSEDomain(const MachineInstr *MI) const;
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const;
 
-  /// SetSSEDomain - Set the SSEDomain of MI.
-  void SetSSEDomain(MachineInstr *MI, unsigned Domain) const;
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
 
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr* MI,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 7eb07b0a97bd3..d54bf275c04f8 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -65,7 +65,7 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,
 
 def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
                                      SDTCisVT<2, i8>]>;
-def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
 def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
                                 SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
@@ -97,6 +97,8 @@ def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
 def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -133,9 +135,13 @@ def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
 def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
                         [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
                          SDNPMayLoad, SDNPMemOperand]>;
-def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
                         [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
                          SDNPMayLoad, SDNPMemOperand]>;
+
 def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
                         [SDNPHasChain, SDNPMayStore,
                          SDNPMayLoad, SDNPMemOperand]>;
@@ -218,12 +224,16 @@ def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
 def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
+def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
 
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
                           [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
 
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+                          [SDNPHasChain]>;
+
 def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
@@ -331,6 +341,11 @@ class ImmSExtAsmOperandClass : AsmOperandClass {
   let RenderMethod = "addImmOperands";
 }
 
+class ImmZExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
 // Sign-extended immediate classes. We don't need to define the full lattice
 // here because there is no instruction with an ambiguity between ImmSExti64i32
 // and ImmSExti32i8.
@@ -358,6 +373,12 @@ def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
   let Name = "ImmSExti32i8";
 }
 
+// [0, 0x000000FF]
+def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
+  let Name = "ImmZExtu32u8";
+}
+
+
 // [0, 0x0000007F]                                            |
 //   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
@@ -377,6 +398,11 @@ def i32i8imm  : Operand<i32> {
   let ParserMatchClass = ImmSExti32i8AsmOperand;
   let OperandType = "OPERAND_IMMEDIATE";
 }
+// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
+def u32u8imm  : Operand<i32> {
+  let ParserMatchClass = ImmZExtu32u8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
 
 // 64-bits but only 32 bits are significant.
 def i64i32imm  : Operand<i64> {
@@ -389,11 +415,13 @@ def i64i32imm  : Operand<i64> {
 def i64i32imm_pcrel : Operand<i64> {
   let PrintMethod = "print_pcrel_imm";
   let ParserMatchClass = X86AbsMemAsmOperand;
+  let OperandType = "OPERAND_PCREL";
 }
 
 // 64-bits but only 8 bits are significant.
 def i64i8imm   : Operand<i64> {
   let ParserMatchClass = ImmSExti64i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
 }
 
 def lea64_32mem : Operand<i32> {
@@ -442,18 +470,33 @@ def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasXMMInt    : Predicate<"Subtarget->hasXMMInt()">;
 
+def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
 def HasCLMUL     : Predicate<"Subtarget->hasCLMUL()">;
 def HasFMA3      : Predicate<"Subtarget->hasFMA3()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C      : Predicate<"Subtarget->hasF16C()">;
+def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI       : Predicate<"Subtarget->hasBMI()">;
 def FPStackf32   : Predicate<"!Subtarget->hasXMM()">;
 def FPStackf64   : Predicate<"!Subtarget->hasXMMInt()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def In32BitMode  : Predicate<"!Subtarget->is64Bit()">,
                              AssemblerPredicate<"!Mode64Bit">;
 def In64BitMode  : Predicate<"Subtarget->is64Bit()">,
                              AssemblerPredicate<"Mode64Bit">;
 def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
 def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
+def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">,
+                             AssemblerPredicate<"ModeNaCl">;
+def IsNaCl32     : Predicate<"Subtarget->isTargetNaCl32()">,
+                             AssemblerPredicate<"ModeNaCl,!Mode64Bit">;
+def IsNaCl64     : Predicate<"Subtarget->isTargetNaCl64()">,
+                             AssemblerPredicate<"ModeNaCl,Mode64Bit">;
+def NotNaCl      : Predicate<"!Subtarget->isTargetNaCl()">,
+                             AssemblerPredicate<"!ModeNaCl">;
 def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
 def KernelCode   : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
 def FarData      : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
@@ -766,30 +809,30 @@ def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
 
 // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
 let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrm, (outs), (ins), "{movsb}", []>;
-def MOVSW : I<0xA5, RawFrm, (outs), (ins), "{movsw}", []>, OpSize;
-def MOVSD : I<0xA5, RawFrm, (outs), (ins), "{movsl|movsd}", []>;
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>;
 def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
 }
 
 // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
 let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrm, (outs), (ins), "{stosb}", []>;
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>;
 let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrm, (outs), (ins), "{stosw}", []>, OpSize;
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize;
 let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSD : I<0xAB, RawFrm, (outs), (ins), "{stosl|stosd}", []>;
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>;
 let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
 def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
 
-def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scas{b}", []>;
-def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scas{w}", []>, OpSize;
-def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l}", []>;
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>;
 def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
 
-def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmps{b}", []>;
-def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmps{w}", []>, OpSize;
-def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l}", []>;
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>;
 def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
 
 
@@ -841,22 +884,22 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
 /// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
 /// 32-bit offset from the PC.  These are only valid in x86-32 mode.
 def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
-                   "mov{b}\t{$src, %al|%al, $src}", []>,
+                   "mov{b}\t{$src, %al|AL, $src}", []>,
                    Requires<[In32BitMode]>;
 def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
-                      "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize,
+                      "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize,
                      Requires<[In32BitMode]>;
 def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
-                      "mov{l}\t{$src, %eax|%eax, $src}", []>,
+                      "mov{l}\t{$src, %eax|EAX, $src}", []>,
                      Requires<[In32BitMode]>;
 def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
-                   "mov{b}\t{%al, $dst|$dst, %al}", []>,
+                   "mov{b}\t{%al, $dst|$dst, AL}", []>,
                   Requires<[In32BitMode]>;
 def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
-                      "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize,
+                      "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize,
                      Requires<[In32BitMode]>;
 def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
-                      "mov{l}\t{%eax, $dst|$dst, %eax}", []>,
+                      "mov{l}\t{%eax, $dst|$dst, EAX}", []>,
                      Requires<[In32BitMode]>;
 
 // FIXME: These definitions are utterly broken
@@ -865,13 +908,13 @@ def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
 // in question.
 /*
 def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
-                      "mov{q}\t{$src, %rax|%rax, $src}", []>;
+                      "mov{q}\t{$src, %rax|RAX, $src}", []>;
 def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
-                       "mov{q}\t{$src, %rax|%rax, $src}", []>;
+                       "mov{q}\t{$src, %rax|RAX, $src}", []>;
 def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
-                       "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
 def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
-                       "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
 */
 
 
@@ -926,7 +969,7 @@ let mayStore = 1 in
 def MOV8mr_NOREX : I<0x88, MRMDestMem,
                      (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", []>;
-let mayLoad = 1,
+let mayLoad = 1, neverHasSideEffects = 1,
     canFoldAsLoad = 1, isReMaterializable = 1 in
 def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
                      (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
@@ -1117,11 +1160,15 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
 }
 
 def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
-                  "xchg{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+                  "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize;
 def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
-                  "xchg{l}\t{$src, %eax|%eax, $src}", []>;
+                  "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>;
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+                   "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>;
 def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
-                  "xchg{q}\t{$src, %rax|%rax, $src}", []>;
+                  "xchg{q}\t{$src, %rax|RAX, $src}", []>;
 
 
 
@@ -1172,7 +1219,7 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
 
 let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
 def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
-                    "cmpxchg16b\t$dst", []>, TB;
+                    "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
 
 
 
@@ -1260,6 +1307,104 @@ def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst),
 def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
                  "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
 
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+  def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8;
+  def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8;
+  def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8;
+  def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8;
+  def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR32:$src), addr:$dst)]>, T8;
+  def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(store (bswap GR64:$src), addr:$dst)]>, T8;
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+  def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+                    "rdrand{w}\t$dst", []>, OpSize, TB;
+  def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+                    "rdrand{l}\t$dst", []>, TB;
+  def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+                     "rdrand{q}\t$dst", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+  def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize;
+  def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize;
+
+  def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS;
+  def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS;
+
+  def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// TZCNT Instruction
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize;
+  def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize;
+
+  def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS;
+  def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS;
+
+  def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
 //===----------------------------------------------------------------------===//
 // Subsystems.
 //===----------------------------------------------------------------------===//
@@ -1646,3 +1791,9 @@ def : InstAlias<"xchgb $mem, $val", (XCHG8rm  GR8 :$val, i8mem :$mem)>;
 def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
 def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
 def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index fe11d776804c7..d3ced23450fee 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -116,7 +116,217 @@ multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
 }
 
 //===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Move Instructions
+//  Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+          (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+          (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasXMMInt] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to pxor for sse.
+// FIXME: Set encoding to pseudo!
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in {
+  def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+                   [(set FR32:$dst, fp32imm0)]>,
+                   Requires<[HasSSE1]>, TB, OpSize;
+  def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+                   [(set FR64:$dst, fpimm0)]>,
+                 Requires<[HasSSE2]>, TB, OpSize;
+  def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+                    [(set FR32:$dst, fp32imm0)]>,
+                    Requires<[HasAVX]>, TB, OpSize, VEX_4V;
+  def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+                    [(set FR64:$dst, fpimm0)]>,
+                    Requires<[HasAVX]>, TB, OpSize, VEX_4V;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, neverHasSideEffects = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+}
+
+def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX.  The 256-bit ISA does not support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementatioan, it does not expand the instructions below like
+// X86MCInstLower does.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, Predicates = [HasAVX] in {
+def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
+                   [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
+def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
+                   [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
+}
+
+
+// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe build zeros.
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, it does not expand the instructions below like
+// X86MCInstLower does.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
+  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
 //===----------------------------------------------------------------------===//
 
 class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> :
@@ -130,28 +340,57 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
          !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))]>;
 
-// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+// AVX
 def VMOVSSrr : sse12_move_rr<FR32, v4f32,
-                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V;
+                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
+                VEX_LIG;
 def VMOVSDrr : sse12_move_rr<FR64, v2f64,
-                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V;
+                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
+                VEX_LIG;
 
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX;
+// For the disassembler
+let isCodeGenOnly = 1 in {
+  def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                        (ins VR128:$src1, FR32:$src2),
+                        "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                        XS, VEX_4V, VEX_LIG;
+  def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                        (ins VR128:$src1, FR64:$src2),
+                        "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                        XD, VEX_4V, VEX_LIG;
+}
 
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
+                 VEX_LIG;
   let AddedComplexity = 20 in
-    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX;
+    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
+                   VEX_LIG;
 }
 
+def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+                  "movss\t{$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG;
+def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+                  "movsd\t{$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG;
+
+// SSE1 & 2
 let Constraints = "$src1 = $dst" in {
   def MOVSSrr : sse12_move_rr<FR32, v4f32,
                           "movss\t{$src2, $dst|$dst, $src2}">, XS;
   def MOVSDrr : sse12_move_rr<FR64, v2f64,
                           "movsd\t{$src2, $dst|$dst, $src2}">, XD;
+
+  // For the disassembler
+  let isCodeGenOnly = 1 in {
+    def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                         (ins VR128:$src1, FR32:$src2),
+                         "movss\t{$src2, $dst|$dst, $src2}", []>, XS;
+    def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                         (ins VR128:$src1, FR64:$src2),
+                         "movsd\t{$src2, $dst|$dst, $src2}", []>, XD;
+  }
 }
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
@@ -161,54 +400,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
     def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
 }
 
-let AddedComplexity = 15 in {
-// Extract the low 32-bit value from one vector and insert it into another.
-def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4f32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-// Extract the low 64-bit value from one vector and insert it into another.
-def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2f64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-}
-
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
-          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
-          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
-          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
-
-let AddedComplexity = 20 in {
-// MOVSSrm zeros the high parts of the register; represent this
-// with SUBREG_TO_REG.
-def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
-          (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
-          (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-          (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-// MOVSDrm zeros the high parts of the register; represent this
-// with SUBREG_TO_REG.
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
-          (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
-          (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-          (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-          (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzload addr:$src)),
-          (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-}
-
-// Store scalar value to memory.
 def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                   "movss\t{$src, $dst|$dst, $src}",
                   [(store FR32:$src, addr:$dst)]>;
@@ -216,24 +407,257 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                   "movsd\t{$src, $dst|$dst, $src}",
                   [(store FR64:$src, addr:$dst)]>;
 
-def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
-                  "movss\t{$src, $dst|$dst, $src}",
-                  [(store FR32:$src, addr:$dst)]>, XS, VEX;
-def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
-                  "movsd\t{$src, $dst|$dst, $src}",
-                  [(store FR64:$src, addr:$dst)]>, XD, VEX;
+// Patterns
+let Predicates = [HasSSE1] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 32-bit value from one vector and insert it into another.
+  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4f32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4i32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)),
+                     (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+  }
 
-// Extract and store.
-def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-                 addr:$dst),
-          (MOVSSmr addr:$dst,
-                   (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
-                 addr:$dst),
-          (MOVSDmr addr:$dst,
-                   (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSSmr addr:$dst,
+                     (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+  // Shuffle with MOVSS
+  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+            (MOVSSrr VR128:$src1, FR32:$src2)>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4i32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4f32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+}
+
+let Predicates = [HasSSE2] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 64-bit value from one vector and insert it into another.
+  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2f64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2i64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSD to the lower bits.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSDmr addr:$dst,
+                     (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+  // Shuffle with MOVSD
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+            (MOVSDrr VR128:$src1, FR64:$src2)>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2i64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2f64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+}
+
+let Predicates = [HasAVX] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 32-bit value from one vector and insert it into another.
+  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+  // Extract the low 64-bit value from one vector and insert it into another.
+  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2f64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2i64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VMOVSSrr (v4f32 (V_SET0)),
+                      (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VMOVSSrr (v4i32 (V_SET0)),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+            (SUBREG_TO_REG (i32 0),
+                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+            (SUBREG_TO_REG (i64 0),
+                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+                           sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSmr addr:$dst,
+                     (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDmr addr:$dst,
+                     (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+            (VMOVSSrr VR128:$src1, FR32:$src2)>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+            (VMOVSDrr VR128:$src1, FR64:$src2)>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2i64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2f64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+                                                   sub_sd))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+                                                   sub_sd))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+                                                   sub_sd))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+                                                   sub_sd))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
 
-// Move Aligned/Unaligned floating point values
 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                             X86MemOperand x86memop, PatFrag ld_frag,
                             string asm, Domain d,
@@ -248,22 +672,22 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
 }
 
 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
-                              "movaps", SSEPackedSingle>, VEX;
+                              "movaps", SSEPackedSingle>, TB, VEX;
 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
-                              "movapd", SSEPackedDouble>, OpSize, VEX;
+                              "movapd", SSEPackedDouble>, TB, OpSize, VEX;
 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
-                              "movups", SSEPackedSingle>, VEX;
+                              "movups", SSEPackedSingle>, TB, VEX;
 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
-                              "movupd", SSEPackedDouble, 0>, OpSize, VEX;
+                              "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
 
 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
-                              "movaps", SSEPackedSingle>, VEX;
+                              "movaps", SSEPackedSingle>, TB, VEX;
 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
-                              "movapd", SSEPackedDouble>, OpSize, VEX;
+                              "movapd", SSEPackedDouble>, TB, OpSize, VEX;
 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
-                              "movups", SSEPackedSingle>, VEX;
+                              "movups", SSEPackedSingle>, TB, VEX;
 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
-                              "movupd", SSEPackedDouble, 0>, OpSize, VEX;
+                              "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle>, TB;
 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
@@ -287,10 +711,10 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    [(store (v2f64 VR128:$src), addr:$dst)]>, VEX;
 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
-                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, VEX;
+                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX;
 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movapd\t{$src, $dst|$dst, $src}",
-                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, VEX;
+                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX;
 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movups\t{$src, $dst|$dst, $src}",
                    [(store (v8f32 VR256:$src), addr:$dst)]>, VEX;
@@ -298,6 +722,34 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)]>, VEX;
 
+// For disassembler
+let isCodeGenOnly = 1 in {
+  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+                          (ins VR128:$src),
+                          "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
 def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
 def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
           (VMOVUPSYmr addr:$dst, VR256:$src)>;
@@ -319,24 +771,155 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v2f64 VR128:$src), addr:$dst)]>;
 
-// Intrinsic forms of MOVUPS/D load and store
-def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs),
-           (ins f128mem:$dst, VR128:$src),
-           "movups\t{$src, $dst|$dst, $src}",
-           [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX;
-def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs),
-           (ins f128mem:$dst, VR128:$src),
-           "movupd\t{$src, $dst|$dst, $src}",
-           [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX;
-
-def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                       "movups\t{$src, $dst|$dst, $src}",
-                       [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
-def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                       "movupd\t{$src, $dst|$dst, $src}",
-                       [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
-
-// Move Low/High packed floating point values
+// For disassembler
+let isCodeGenOnly = 1 in {
+  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movaps\t{$src, $dst|$dst, $src}", []>;
+  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movapd\t{$src, $dst|$dst, $src}", []>;
+  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movups\t{$src, $dst|$dst, $src}", []>;
+  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movupd\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE1] in
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [HasSSE2] in
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [HasSSE1] in {
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (MOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX] in {
+  // 128-bit load/store
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+  // 256-bit load/store
+  def : Pat<(alignedloadv4i64 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv4i64 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+}
+
+// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let neverHasSideEffects = 1 in {
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                     "movaps\t{$src, $dst|$dst, $src}", []>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                     "movapd\t{$src, $dst|$dst, $src}", []>;
+def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                       "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                       "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                     "movaps\t{$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                     "movapd\t{$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+let isCodeGenOnly = 1 in {
+  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                         "movaps\t{$src, $dst|$dst, $src}",
+                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX;
+  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                         "movapd\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
 multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
                                  PatFrag mov_frag, string base_opc,
                                  string asm_opr> {
@@ -359,33 +942,170 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
 let AddedComplexity = 20 in {
   defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
-  defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
-                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
 }
 let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
   defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
                                    "\t{$src2, $dst|$dst, $src2}">;
+}
+
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)]>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)]>, VEX;
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)]>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)]>;
+
+let Predicates = [HasAVX] in {
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+    def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+              (VMOVLPSrm VR128:$src1, addr:$src2)>;
+    def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+              (VMOVLPSrm VR128:$src1, addr:$src2)>;
+    // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
+    def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+              (VMOVLPDrm VR128:$src1, addr:$src2)>;
+    def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+              (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  }
+
+  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
+                                 VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+
+  // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
+  def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with VMOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Shuffle with VMOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1,
+                              (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE1] in {
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+    def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+              (MOVLPSrm VR128:$src1, addr:$src2)>;
+    def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+              (MOVLPSrm VR128:$src1, addr:$src2)>;
+  }
+
+  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
+                                 VR128:$src2)), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with MOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                                      addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+                              addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE2] in {
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
+    def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+              (MOVLPDrm VR128:$src1, addr:$src2)>;
+    def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+              (MOVLPDrm VR128:$src1, addr:$src2)>;
+  }
+
+  // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
+  def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with MOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1,
+                              (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+  defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
+                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
   defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
                                    "\t{$src2, $dst|$dst, $src2}">;
 }
 
-def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movlps\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
-                                 (iPTR 0))), addr:$dst)]>, VEX;
-def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movlpd\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract (v2f64 VR128:$src),
-                                 (iPTR 0))), addr:$dst)]>, VEX;
-def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movlps\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
-                                 (iPTR 0))), addr:$dst)]>;
-def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movlpd\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract (v2f64 VR128:$src),
-                                 (iPTR 0))), addr:$dst)]>;
-
 // v2f64 extract element 1 is always custom lowered to unpack high to low
 // and extract element 0 so the non-store version isn't too horrible.
 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
@@ -411,6 +1131,80 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                                  (v2f64 (unpckh VR128:$src, (undef))),
                                  (iPTR 0))), addr:$dst)]>;
 
+let Predicates = [HasAVX] in {
+  // VMOVHPS patterns
+  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // FIXME: This should be matched by a X86Movhpd instead. Same as above
+  def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (f64 (vector_extract
+            (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+            (VMOVHPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (f64 (vector_extract
+            (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+            (VMOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE1] in {
+  // MOVHPS patterns
+  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (f64 (vector_extract
+            (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+            (MOVHPSmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+  // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // FIXME: This should be matched by a X86Movhpd instead. Same as above
+  def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (f64 (vector_extract
+            (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst),
+            (MOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
 let AddedComplexity = 20 in {
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
@@ -438,13 +1232,80 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                         (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
 }
 
-def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
-          (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
-let AddedComplexity = 20 in {
-  def : Pat<(v4f32 (movddup VR128:$src, (undef))),
-            (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
-  def : Pat<(v2i64 (movddup VR128:$src, (undef))),
-            (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+let Predicates = [HasAVX] in {
+  // MOVLHPS patterns
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+              (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+              (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+              (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  }
+  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+              (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+  }
+
+  def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE1] in {
+  // MOVLHPS patterns
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+              (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+              (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+              (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  }
+  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+              (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+  }
+
+  def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -462,10 +1323,9 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
 
 multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
-  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
-                        []>;
-  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
-                        []>;
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>;
 }
 
 multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
@@ -481,36 +1341,39 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
+  let mayLoad = 1 in
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins DstRC:$src1, x86memop:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
 }
 
 defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
-                                "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX;
+                                "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
+                                VEX_LIG;
 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                 "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
-                                VEX_W;
+                                VEX_W, VEX_LIG;
 defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
-                                "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+                                "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX,
+                                VEX_LIG;
 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                 "cvttsd2si\t{$src, $dst|$dst, $src}">, XD,
-                                VEX, VEX_W;
+                                VEX, VEX_W, VEX_LIG;
 
 // The assembler can recognize rr 64-bit instructions by seeing a rxx
 // register, but the same isn't true when only using memory operands,
 // provide other assembly "l" and "q" forms to address this explicitly
 // where appropriate to do so.
 defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS,
-                                  VEX_4V;
+                                  VEX_4V, VEX_LIG;
 defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS,
-                                  VEX_4V, VEX_W;
+                                  VEX_4V, VEX_W, VEX_LIG;
 defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD,
-                                  VEX_4V;
+                                  VEX_4V, VEX_LIG;
 defm VCVTSI2SDL  : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD,
-                                  VEX_4V;
+                                  VEX_4V, VEX_LIG;
 defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD,
-                                  VEX_4V, VEX_W;
+                                  VEX_4V, VEX_W, VEX_LIG;
 
 let Predicates = [HasAVX] in {
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
@@ -579,11 +1442,6 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
               [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>;
 }
 
-defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
-                      f32mem, load, "cvtss2si">, XS, VEX;
-defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
-                        int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">,
-                        XS, VEX, VEX_W;
 defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                       f128mem, load, "cvtsd2si">, XD, VEX;
 defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
@@ -594,14 +1452,12 @@ defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
 // Get rid of this hack or rename the intrinsics, there are several
 // intructions that only match with the intrinsic form, why create duplicates
 // to let them be recognized by the assembler?
-defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
-                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+defm VCVTSD2SI     : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
+                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG;
 defm VCVTSD2SI64   : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
-                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W;
-defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
-                      f32mem, load, "cvtss2si">, XS;
-defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
-                      f32mem, load, "cvtss2si{q}">, XS, REX_W;
+                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W,
+                      VEX_LIG;
+
 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 f128mem, load, "cvtsd2si{l}">, XD;
 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
@@ -660,10 +1516,11 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
 
 let Pattern = []<dag> in {
 defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
-                               "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
+                               "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS,
+                               VEX, VEX_LIG;
 defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
                                "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
-                               VEX_W;
+                               VEX_W, VEX_LIG;
 defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
                                "cvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle>, TB, VEX;
@@ -671,6 +1528,7 @@ defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
                                "cvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle>, TB, VEX;
 }
+
 let Pattern = []<dag> in {
 defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
                           "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
@@ -681,19 +1539,43 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
                             SSEPackedSingle>, TB; /* PD SSE3 form is avaiable */
 }
 
+let Predicates = [HasSSE1] in {
+  def : Pat<(int_x86_sse_cvtss2si VR128:$src),
+            (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
+            (CVTSS2SIrm addr:$src)>;
+  def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
+            (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
+            (CVTSS2SI64rm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_cvtss2si VR128:$src),
+            (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
+            (VCVTSS2SIrm addr:$src)>;
+  def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
+            (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
+            (VCVTSS2SI64rm addr:$src)>;
+}
+
 /// SSE 2 Only
 
 // Convert scalar double to scalar single
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
-                      VEX_4V;
+                      VEX_4V, VEX_LIG;
+let mayLoad = 1 in
 def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                        (ins FR64:$src1, f64mem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
+                      []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
+
 def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
-        Requires<[HasAVX]>;
+          Requires<[HasAVX]>;
 
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
@@ -715,13 +1597,25 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR32:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    []>, XS, Requires<[HasAVX]>, VEX_4V;
+                    []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
+let mayLoad = 1 in
 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                     (ins FR32:$src1, f32mem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>;
-def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>,
-        Requires<[HasAVX]>;
+                    []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(f64 (fextend FR32:$src)),
+            (VCVTSS2SDrr FR32:$src, FR32:$src)>;
+  def : Pat<(fextend (loadf32 addr:$src)),
+            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(extloadf32 addr:$src),
+            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+}
+
+def : Pat<(extloadf32 addr:$src),
+          (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>,
+          Requires<[HasAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -732,6 +1626,16 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                    [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
                  Requires<[HasSSE2, OptForSize]>;
 
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+          (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+
 def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -759,10 +1663,6 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                     Requires<[HasSSE2]>;
 }
 
-def : Pat<(extloadf32 addr:$src),
-          (CVTSS2SDrr (MOVSSrm addr:$src))>,
-      Requires<[HasSSE2, OptForSpeed]>;
-
 // Convert doubleword to packed single/double fp
 // SSE2 instructions without OpSize prefix
 def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -862,10 +1762,12 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 // SSE2 packed instructions with XS prefix
 def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+let mayLoad = 1 in
 def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+let mayLoad = 1 in
 def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -877,7 +1779,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       [(set VR128:$dst,
                             (int_x86_sse2_cvttps2dq (memop addr:$src)))]>;
 
-
 def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
@@ -889,16 +1790,33 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                                            (memop addr:$src)))]>,
                       XS, VEX, Requires<[HasAVX]>;
 
-def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst),
-                            (ins VR128:$src),
-                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>,
-                       VEX;
-def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst),
-                          (ins f128mem:$src),
-                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
-                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
-                                             (memop addr:$src)))]>, VEX;
+let Predicates = [HasSSE2] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (Int_CVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (CVTTPS2DQrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (Int_VCVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (VCVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+            (VCVTDQ2PSYrr VR256:$src)>;
+  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+            (VCVTTPS2DQYrr VR256:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                              (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX;
+let isCodeGenOnly = 1 in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                               (memop addr:$src)))]>, VEX;
 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
@@ -910,8 +1828,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
 // Provide other assembly rr and rm forms to address this explicitly.
-def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                        "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                           "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 
@@ -931,13 +1847,13 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
 let Predicates = [HasAVX] in {
                   // SSE2 instructions without OpSize prefix
 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
-                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
 }
 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
@@ -947,12 +1863,12 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
 def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
-                     VEX, Requires<[HasAVX]>;
+                     TB, VEX, Requires<[HasAVX]>;
 def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                        "vcvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                           (load addr:$src)))]>,
-                     VEX, Requires<[HasAVX]>;
+                     TB, VEX, Requires<[HasAVX]>;
 def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
@@ -1038,75 +1954,61 @@ def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src),
 def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)),
           (VCVTTPS2DQYrm addr:$src)>;
 
+// Match fround and fextend for 128/256-bit conversions
+def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+          (VCVTPD2PSYrr VR256:$src)>;
+def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+          (VCVTPD2PSYrm addr:$src)>;
+
+def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+          (VCVTPS2PDYrr VR128:$src)>;
+def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+          (VCVTPS2PDYrm addr:$src)>;
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Compare Instructions
 //===----------------------------------------------------------------------===//
 
 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            SDNode OpNode, ValueType VT, PatFrag ld_frag,
                             string asm, string asm_alt> {
-  let isAsmParserOnly = 1 in {
-    def rr : SIi8<0xC2, MRMSrcReg,
-                  (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc),
-                  asm, []>;
-    let mayLoad = 1 in
-    def rm : SIi8<0xC2, MRMSrcMem,
-                  (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc),
-                  asm, []>;
-  }
+  def rr : SIi8<0xC2, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>;
+  def rm : SIi8<0xC2, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1),
+                                         (ld_frag addr:$src2), imm:$cc))]>;
 
   // Accept explicit immediate argument form instead of comparison code.
-  def rr_alt : SIi8<0xC2, MRMSrcReg,
-                (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2),
-                asm_alt, []>;
-  let mayLoad = 1 in
-  def rm_alt : SIi8<0xC2, MRMSrcMem,
-                (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2),
-                asm_alt, []>;
+  let neverHasSideEffects = 1 in {
+    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>;
+    let mayLoad = 1 in
+    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>;
+  }
 }
 
-let neverHasSideEffects = 1 in {
-  defm VCMPSS  : sse12_cmp_scalar<FR32, f32mem,
-                  "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
-                  "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
-                  XS, VEX_4V;
-  defm VCMPSD  : sse12_cmp_scalar<FR64, f64mem,
-                  "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                  "cmpsd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
-                  XD, VEX_4V;
-}
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
+                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+                 XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
+                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+                 XD, VEX_4V, VEX_LIG;
 
 let Constraints = "$src1 = $dst" in {
-def CMPSSrr : SIi8<0xC2, MRMSrcReg,
-                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc),
-                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
-                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, imm:$cc))]>, XS;
-def CMPSSrm : SIi8<0xC2, MRMSrcMem,
-                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc),
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
                   "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
-                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS;
-def CMPSDrr : SIi8<0xC2, MRMSrcReg,
-                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc),
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+                  XS;
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
-                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD;
-def CMPSDrm : SIi8<0xC2, MRMSrcMem,
-                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc),
-                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
-                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD;
-}
-let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
-def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg,
-                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
-                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
-def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem,
-                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
-                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
-def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg,
-                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
-                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
-def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem,
-                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
-                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
+                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+                  XD;
 }
 
 multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
@@ -1151,25 +2053,28 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
 
 let Defs = [EFLAGS] in {
   defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
-                                  "ucomiss", SSEPackedSingle>, VEX;
+                                  "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
   defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
-                                  "ucomisd", SSEPackedDouble>, OpSize, VEX;
+                                  "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
+                                  VEX_LIG;
   let Pattern = []<dag> in {
     defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
-                                    "comiss", SSEPackedSingle>, VEX;
+                                    "comiss", SSEPackedSingle>, TB, VEX,
+                                    VEX_LIG;
     defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
-                                    "comisd", SSEPackedDouble>, OpSize, VEX;
+                                    "comisd", SSEPackedDouble>, TB, OpSize, VEX,
+                                    VEX_LIG;
   }
 
   defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
-                            load, "ucomiss", SSEPackedSingle>, VEX;
+                            load, "ucomiss", SSEPackedSingle>, TB, VEX;
   defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
-                            load, "ucomisd", SSEPackedDouble>, OpSize, VEX;
+                            load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
 
   defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
-                            load, "comiss", SSEPackedSingle>, VEX;
+                            load, "comiss", SSEPackedSingle>, TB, VEX;
   defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
-                            load, "comisd", SSEPackedDouble>, OpSize, VEX;
+                            load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
   defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                   "ucomiss", SSEPackedSingle>, TB;
   defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
@@ -1199,57 +2104,82 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             Domain d> {
   let isAsmParserOnly = 1 in {
     def rri : PIi8<0xC2, MRMSrcReg,
-               (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm,
-               [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>;
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
+               [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>;
     def rmi : PIi8<0xC2, MRMSrcMem,
-               (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm,
-               [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>;
+               (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm,
+               [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>;
   }
 
   // Accept explicit immediate argument form instead of comparison code.
   def rri_alt : PIi8<0xC2, MRMSrcReg,
-             (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2),
+             (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
              asm_alt, [], d>;
   def rmi_alt : PIi8<0xC2, MRMSrcMem,
-             (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2),
+             (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc),
              asm_alt, [], d>;
 }
 
 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
-               "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
-               "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-               SSEPackedSingle>, VEX_4V;
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, TB, VEX_4V;
 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
-               "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
-               "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-               SSEPackedDouble>, OpSize, VEX_4V;
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, TB, OpSize, VEX_4V;
 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
-               "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
-               "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-               SSEPackedSingle>, VEX_4V;
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, TB, VEX_4V;
 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
-               "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
-               "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-               SSEPackedDouble>, OpSize, VEX_4V;
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, TB, OpSize, VEX_4V;
 let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
-                 "cmp${cc}ps\t{$src, $dst|$dst, $src}",
-                 "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}",
+                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSEPackedSingle>, TB;
   defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
-                 "cmp${cc}pd\t{$src, $dst|$dst, $src}",
-                 "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}",
+                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSEPackedDouble>, TB, OpSize;
 }
 
+let Predicates = [HasSSE1] in {
 def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
 def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
           (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
 def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
           (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
 def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
           (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Shuffle Instructions
@@ -1293,6 +2223,132 @@ let Constraints = "$src1 = $dst" in {
                     memopv2f64, SSEPackedDouble>, TB, OpSize;
 }
 
+let Predicates = [HasSSE1] in {
+  def : Pat<(v4f32 (X86Shufps VR128:$src1,
+                       (memopv4f32 addr:$src2), (i8 imm:$imm))),
+            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufps VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+  // fall back to this for SSE1)
+  def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+            (SHUFPSrri VR128:$src2, VR128:$src1,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special unary SHUFPSrri case.
+  def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+            (SHUFPSrri VR128:$src1, VR128:$src1,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+}
+
+let Predicates = [HasSSE2] in {
+  // Special binary v4i32 shuffle cases with SHUFPS.
+  def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+            (SHUFPSrri VR128:$src1, VR128:$src2,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
+                                (bc_v4i32 (memopv2i64 addr:$src2)))),
+            (SHUFPSrmi VR128:$src1, addr:$src2,
+                      (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special unary SHUFPDrri cases.
+  def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+            (SHUFPDrri VR128:$src1, VR128:$src1,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+            (SHUFPDrri VR128:$src1, VR128:$src1,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special binary v2i64 shuffle cases using SHUFPDrri.
+  def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+            (SHUFPDrri VR128:$src1, VR128:$src2,
+                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Generic SHUFPD patterns
+  def : Pat<(v2f64 (X86Shufps VR128:$src1,
+                       (memopv2f64 addr:$src2), (i8 imm:$imm))),
+            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86Shufps VR128:$src1,
+                       (memopv4f32 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufps VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+  // fall back to this for SSE1)
+  def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+            (VSHUFPSrri VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special unary SHUFPSrri case.
+  def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+            (VSHUFPSrri VR128:$src1, VR128:$src1,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special binary v4i32 shuffle cases with SHUFPS.
+  def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
+                                (bc_v4i32 (memopv2i64 addr:$src2)))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special unary SHUFPDrri cases.
+  def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+            (VSHUFPDrri VR128:$src1, VR128:$src1,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+            (VSHUFPDrri VR128:$src1, VR128:$src1,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+  // Special binary v2i64 shuffle cases using SHUFPDrri.
+  def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+            (VSHUFPDrri VR128:$src1, VR128:$src2,
+                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
+
+  def : Pat<(v2f64 (X86Shufps VR128:$src1,
+                       (memopv2f64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+  def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufps VR256:$src1,
+                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8f32 (X86Shufps VR256:$src1,
+                              (memopv8f32 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufpd VR256:$src1,
+                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4f64 (X86Shufpd VR256:$src1,
+                              (memopv4f64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Unpack Instructions
 //===----------------------------------------------------------------------===//
@@ -1316,29 +2372,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
 let AddedComplexity = 10 in {
   defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
         VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedSingle>, VEX_4V;
+                       SSEPackedSingle>, TB, VEX_4V;
   defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
         VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedDouble>, OpSize, VEX_4V;
+                       SSEPackedDouble>, TB, OpSize, VEX_4V;
   defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
         VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedSingle>, VEX_4V;
+                       SSEPackedSingle>, TB, VEX_4V;
   defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
         VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedDouble>, OpSize, VEX_4V;
+                       SSEPackedDouble>, TB, OpSize, VEX_4V;
 
   defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
         VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedSingle>, VEX_4V;
+                       SSEPackedSingle>, TB, VEX_4V;
   defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
         VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedDouble>, OpSize, VEX_4V;
+                       SSEPackedDouble>, TB, OpSize, VEX_4V;
   defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
         VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedSingle>, VEX_4V;
+                       SSEPackedSingle>, TB, VEX_4V;
   defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
         VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                       SSEPackedDouble>, OpSize, VEX_4V;
+                       SSEPackedDouble>, TB, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
@@ -1356,6 +2412,103 @@ let AddedComplexity = 10 in {
   } // Constraints = "$src1 = $dst"
 } // AddedComplexity
 
+let Predicates = [HasSSE1] in {
+  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
+            (UNPCKLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
+            (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
+            (UNPCKHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
+            (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
+            (UNPCKLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
+            (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
+            (UNPCKHPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
+            (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
+
+  // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (UNPCKLPDrr VR128:$src, VR128:$src)>;
+
+  let AddedComplexity = 10 in
+  def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+            (UNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
+            (VUNPCKLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
+            (VUNPCKLPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
+            (VUNPCKHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
+            (VUNPCKHPSrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
+            (VUNPCKLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
+            (VUNPCKLPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
+            (VUNPCKHPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
+            (VUNPCKHPDrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+
+  // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+  let AddedComplexity = 10 in
+  def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Extract Floating-Point Sign mask
 //===----------------------------------------------------------------------===//
@@ -1370,91 +2523,60 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W;
 }
 
-// Mask creation
-defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
-                                      "movmskps", SSEPackedSingle>, VEX;
-defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
-                                      "movmskpd", SSEPackedDouble>, OpSize,
-                                      VEX;
-defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
-                                      "movmskps", SSEPackedSingle>, VEX;
-defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
-                                      "movmskpd", SSEPackedDouble>, OpSize,
-                                      VEX;
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                      SSEPackedSingle>, TB;
 defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                      SSEPackedDouble>, TB, OpSize;
 
-// X86fgetsign
-def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
-                    "movmskpd\t{$src, $dst|$dst, $src}",
-                    [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
-def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
-                    "movmskpd\t{$src, $dst|$dst, $src}",
-                    [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
-def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
-                    "movmskps\t{$src, $dst|$dst, $src}",
-                    [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
-def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
-                    "movmskps\t{$src, $dst|$dst, $src}",
-                    [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
-
-// Assembler Only
-def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-           "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-           "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
-           VEX;
-def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-           "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-           "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
-           VEX;
-
-//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions
-//===----------------------------------------------------------------------===//
-
-// Aliases of packed SSE1 & SSE2 instructions for scalar use. These all have
-// names that start with 'Fs'.
-
-// Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
-    canFoldAsLoad = 1 in {
-  // FIXME: Set encoding to pseudo!
-def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
-                 [(set FR32:$dst, fp32imm0)]>,
-                 Requires<[HasSSE1]>, TB, OpSize;
-def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
-                 [(set FR64:$dst, fpimm0)]>,
-               Requires<[HasSSE2]>, TB, OpSize;
-def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
-                  [(set FR32:$dst, fp32imm0)]>,
-                  Requires<[HasAVX]>, TB, OpSize, VEX_4V;
-def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
-                  [(set FR64:$dst, fpimm0)]>,
-                  Requires<[HasAVX]>, TB, OpSize, VEX_4V;
-}
-
-// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
-// bits are disregarded.
-let neverHasSideEffects = 1 in {
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                     "movaps\t{$src, $dst|$dst, $src}", []>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                     "movapd\t{$src, $dst|$dst, $src}", []>;
-}
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+          (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+                                       sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+          (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+                                       sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+          (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+                                       sub_sd))>, Requires<[HasSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+          (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+                                       sub_sd))>, Requires<[HasSSE2]>;
 
-// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
-// bits are disregarded.
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
-                     "movaps\t{$src, $dst|$dst, $src}",
-                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
-                     "movapd\t{$src, $dst|$dst, $src}",
-                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+let Predicates = [HasAVX] in {
+  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+                                        "movmskps", SSEPackedSingle>, TB, VEX;
+  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+                                        "movmskpd", SSEPackedDouble>, TB,
+                                        OpSize, VEX;
+  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+                                        "movmskps", SSEPackedSingle>, TB, VEX;
+  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+                                        "movmskpd", SSEPackedDouble>, TB,
+                                        OpSize, VEX;
+
+  def : Pat<(i32 (X86fgetsign FR32:$src)),
+            (VMOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+                                          sub_ss))>;
+  def : Pat<(i64 (X86fgetsign FR32:$src)),
+            (VMOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+                                          sub_ss))>;
+  def : Pat<(i32 (X86fgetsign FR64:$src)),
+            (VMOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+                                          sub_sd))>;
+  def : Pat<(i64 (X86fgetsign FR64:$src)),
+            (VMOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+                                          sub_sd))>;
+
+  // Assembler Only
+  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+             OpSize, VEX;
+  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+             OpSize, VEX;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1466,10 +2588,10 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
 multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                        SDNode OpNode> {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V;
+              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V;
+        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
@@ -1494,21 +2616,22 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
 ///
 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                    SDNode OpNode> {
-  let Pattern = []<dag> in {
-    defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
-         !strconcat(OpcodeStr, "ps"), f128mem,
-         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
-         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
-                                   (memopv2i64 addr:$src2)))], 0>, VEX_4V;
-
-    defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
-         !strconcat(OpcodeStr, "pd"), f128mem,
-         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
-                                   (bc_v2i64 (v2f64 VR128:$src2))))],
-         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
-                                   (memopv2i64 addr:$src2)))], 0>,
-                                                   OpSize, VEX_4V;
-  }
+  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
+  // are all promoted to v2i64, and the patterns are covered by the int
+  // version. This is needed in SSE only, because v2i64 isn't supported on
+  // SSE1, but only on SSE2.
+  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+       !strconcat(OpcodeStr, "ps"), f128mem, [],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                 (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+       !strconcat(OpcodeStr, "pd"), f128mem,
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (bc_v2i64 (v2f64 VR128:$src2))))],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (memopv2i64 addr:$src2)))], 0>,
+                                                 TB, OpSize, VEX_4V;
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f128mem,
@@ -1533,7 +2656,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
           !strconcat(OpcodeStr, "ps"), f256mem,
           [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
           [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
-                                    (memopv4i64 addr:$src2)))], 0>, VEX_4V;
+                                    (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;
 
     defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
           !strconcat(OpcodeStr, "pd"), f256mem,
@@ -1541,7 +2664,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
                                     (bc_v4i64 (v4f64 VR256:$src2))))],
           [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                     (memopv4i64 addr:$src2)))], 0>,
-                                    OpSize, VEX_4V;
+                                    TB, OpSize, VEX_4V;
 }
 
 // AVX 256-bit packed logical ops forms
@@ -1632,32 +2755,32 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> {
 
 // Binary Arithmetic instructions
 defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
-            basic_sse12_fp_binop_s_int<0x58, "add", 0>,
-            basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
+            basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG;
+defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
             basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
 defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
-            basic_sse12_fp_binop_s_int<0x59, "mul", 0>,
-            basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
+            basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG;
+defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
             basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;
 
 let isCommutable = 0 in {
   defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
-              basic_sse12_fp_binop_s_int<0x5C, "sub", 0>,
-              basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
+              basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG;
+  defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
               basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
   defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
-              basic_sse12_fp_binop_s_int<0x5E, "div", 0>,
-              basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
+              basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG;
+  defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
               basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
   defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
-              basic_sse12_fp_binop_s_int<0x5F, "max", 0>,
-              basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
+              basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG;
+  defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
               basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
               basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
               basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
   defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
-              basic_sse12_fp_binop_s_int<0x5D, "min", 0>,
-              basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
+              basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG;
+  defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
               basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
               basic_sse12_fp_binop_p_y_int<0x5D, "min">,
               basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
@@ -1720,23 +2843,18 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
 }
 
 /// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
-multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
-                              SDNode OpNode, Intrinsic F32Int> {
+multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
   def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
-                !strconcat(OpcodeStr,
-                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                []>, XS, Requires<[HasAVX, OptForSize]>;
-  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+  let mayLoad = 1 in
+  def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
                 !strconcat(OpcodeStr,
-                           "ss\t{$src, $dst, $dst|$dst, $dst, $src}"),
-                [(set VR128:$dst, (F32Int VR128:$src))]>;
-  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                (ins ssmem:$src1, VR128:$src2),
                 !strconcat(OpcodeStr,
-                           "ss\t{$src, $dst, $dst|$dst, $dst, $src}"),
-                [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
 }
 
 /// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -1801,21 +2919,17 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
 }
 
 /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
-multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
-                              SDNode OpNode, Intrinsic F64Int> {
+multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
   def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
-               (ins FR64:$src1, f64mem:$src2),
+  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, sdmem:$src2),
                !strconcat(OpcodeStr,
                           "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-           !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"),
-           [(set VR128:$dst, (F64Int VR128:$src))]>;
-  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
-           !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"),
-           [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
 }
 
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -1863,9 +2977,8 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
 
 let Predicates = [HasAVX] in {
   // Square root.
-  defm VSQRT  : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>,
-                sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>,
-                VEX_4V;
+  defm VSQRT  : sse1_fp_unop_s_avx<0x51, "vsqrt">,
+                sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;
 
   defm VSQRT  : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>,
                 sse2_fp_unop_p<0x51, "vsqrt", fsqrt>,
@@ -1879,21 +2992,76 @@ let Predicates = [HasAVX] in {
 
   // Reciprocal approximations. Note that these typically require refinement
   // in order to obtain suitable precision.
-  defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt", X86frsqrt,
-                                   int_x86_sse_rsqrt_ss>, VEX_4V;
+  defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
   defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>,
                 sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>,
                 sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>,
                 sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX;
 
-  defm VRCP   : sse1_fp_unop_s_avx<0x53, "vrcp", X86frcp, int_x86_sse_rcp_ss>,
-                                   VEX_4V;
+  defm VRCP   : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
   defm VRCP   : sse1_fp_unop_p<0x53, "vrcp", X86frcp>,
                 sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>,
                 sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>,
                 sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX;
 }
 
+def : Pat<(f32 (fsqrt FR32:$src)),
+          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (fsqrt (load addr:$src))),
+          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+def : Pat<(f64 (fsqrt FR64:$src)),
+          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fsqrt (load addr:$src))),
+          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frsqrt FR32:$src)),
+          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frsqrt (load addr:$src))),
+          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frcp FR32:$src)),
+          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frcp (load addr:$src))),
+          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
+            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+                (VSQRTSSr (f32 (IMPLICIT_DEF)),
+                          (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+                sub_ss)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
+            (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
+                (VSQRTSDr (f64 (IMPLICIT_DEF)),
+                          (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd)),
+                sub_sd)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+                (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+                          (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+                sub_ss)>;
+  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
+            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+                (VRCPSSr (f32 (IMPLICIT_DEF)),
+                         (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+                sub_ss)>;
+  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
+            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+
 // Square root.
 defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss>,
              sse1_fp_unop_p<0x51, "sqrt",  fsqrt>,
@@ -1992,7 +3160,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
 
 def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
-          (MOVNTDQmr addr:$dst, VR128:$src)>;
+          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
 
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -2006,7 +3174,7 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
 }
 
 //===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Misc Instructions (No AVX form)
+// SSE 1 & 2 - Prefetch and memory fence
 //===----------------------------------------------------------------------===//
 
 // Prefetch intrinsic.
@@ -2019,66 +3187,26 @@ def PREFETCHT2   : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
 def PREFETCHNTA  : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
     "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>;
 
-// Load, store, and memory fence
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
-             TB, Requires<[HasSSE1]>;
-def : Pat<(X86SFence), (SFENCE)>;
-
-// Alias instructions that map zero vector to pxor / xorp* for sse.
-// We set canFoldAsLoad because this can be converted to a constant-pool
-// load of an all-zeros value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1 in {
-def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
-def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v2f64 immAllZerosV))]>;
-let ExeDomain = SSEPackedInt in
-def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;
-}
-
-// The same as done above but for AVX. The 128-bit versions are the
-// same, but re-encoded. The 256-bit does not support PI version, and
-// doesn't need it because on sandy bridge the register is set to zero
-// at the rename stage without using any execution unit, so SET0PSY
-// and SET0PDY can be used for vector int instructions without penalty
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, Predicates = [HasAVX] in {
-def AVX_SET0PS  : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PD  : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v2f64 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
-                   [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-let ExeDomain = SSEPackedInt in
-def AVX_SET0PI  : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4i32 immAllZerosV))]>;
-}
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+              TB, Requires<[HasSSE2]>;
 
-def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
 
-def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-          (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+               "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
 
-// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
-// in the non-AVX version bits 127:64 aren't touched. Find a better way to
-// represent this instead of always zeroing SRC1. One possible solution is
-// to represent the instruction w/ something similar as the "$src1 = $dst"
-// constraint but without the tied operands.
-def : Pat<(extloadf32 addr:$src),
-          (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), addr:$src)>,
-      Requires<[HasAVX, OptForSpeed]>;
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Load/Store XCSR register
@@ -2106,10 +3234,22 @@ def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
 }
-def VMOVDQUrr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                    "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
-def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
-                    "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
+def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+
+// For Disassembler
+let isCodeGenOnly = 1 in {
+def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+}
 
 let canFoldAsLoad = 1, mayLoad = 1 in {
 def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -2147,6 +3287,16 @@ def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}",
                    []>, XS, Requires<[HasSSE2]>;
 
+// For Disassembler
+let isCodeGenOnly = 1 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqa\t{$src, $dst|$dst, $src}", []>;
+
+def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       []>, XS, Requires<[HasSSE2]>;
+}
+
 let canFoldAsLoad = 1, mayLoad = 1 in {
 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
@@ -2180,9 +3330,11 @@ def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
 
 } // ExeDomain = SSEPackedInt
 
-def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
-def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
-          (VMOVDQUYmr addr:$dst, VR256:$src)>;
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
+  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+            (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Arithmetic Instructions
@@ -2415,15 +3567,14 @@ let ExeDomain = SSEPackedInt in {
   def VPANDNrr : PDI<0xDF, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                              VR128:$src2)))]>, VEX_4V;
+                    [(set VR128:$dst,
+                          (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V;
 
   def VPANDNrm : PDI<0xDF, MRMSrcMem,
                     (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                     "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                              (memopv2i64 addr:$src2))))]>,
-                                              VEX_4V;
+                    [(set VR128:$dst, (X86andnp VR128:$src1,
+                                            (memopv2i64 addr:$src2)))]>, VEX_4V;
 }
 }
 
@@ -2527,6 +3678,32 @@ let Predicates = [HasAVX] in {
                                     0>, VEX_4V;
   defm VPCMPGTD  : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0,
                                     0>, VEX_4V;
+
+  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+            (VPCMPEQBrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+            (VPCMPEQBrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+            (VPCMPEQWrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+            (VPCMPEQWrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+            (VPCMPEQDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+            (VPCMPEQDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+            (VPCMPGTBrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+            (VPCMPGTBrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+            (VPCMPGTWrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+            (VPCMPGTWrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+            (VPCMPGTDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+            (VPCMPGTDrm VR128:$src1, addr:$src2)>;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -2538,31 +3715,33 @@ let Constraints = "$src1 = $dst" in {
   defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
 } // Constraints = "$src1 = $dst"
 
-def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
-          (PCMPEQBrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
-          (PCMPEQBrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
-          (PCMPEQWrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
-          (PCMPEQWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
-          (PCMPEQDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
-          (PCMPEQDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
-          (PCMPGTBrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
-          (PCMPGTBrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
-          (PCMPGTWrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
-          (PCMPGTWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
-          (PCMPGTDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
-          (PCMPGTDrm VR128:$src1, addr:$src2)>;
+let Predicates = [HasSSE2] in {
+  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+            (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+            (PCMPEQBrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+            (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+            (PCMPEQWrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+            (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+            (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+            (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+            (PCMPGTBrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+            (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+            (PCMPGTWrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+            (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+            (PCMPGTDrm VR128:$src1, addr:$src2)>;
+}
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Pack Instructions
@@ -2608,7 +3787,7 @@ def mi : Ii8<0x70, MRMSrcMem,
 
 let Predicates = [HasAVX] in {
   let AddedComplexity = 5 in
-  defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize,
+  defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize,
                                VEX;
 
   // SSE2 with ImmT == Imm8 and XS prefix.
@@ -2618,6 +3797,34 @@ let Predicates = [HasAVX] in {
   // SSE2 with ImmT == Imm8 and XD prefix.
   defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD,
                                VEX;
+
+  let AddedComplexity = 5 in
+  def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+            (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+  // Unary v4f32 shuffle with VPSHUF* in order to fold a load.
+  def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+            (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+
+  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
+                                   (i8 imm:$imm))),
+            (VPSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
+                                   (i8 imm:$imm))),
+            (VPSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (VPSHUFDri VR128:$src1, imm:$imm)>;
+  def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (VPSHUFDri VR128:$src1, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
+            (VPSHUFHWri VR128:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
+                               (i8 imm:$imm))),
+            (VPSHUFHWmi addr:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
+            (VPSHUFLWri VR128:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
+                               (i8 imm:$imm))),
+            (VPSHUFLWmi addr:$src, imm:$imm)>;
 }
 
 let Predicates = [HasSSE2] in {
@@ -2629,6 +3836,34 @@ let Predicates = [HasSSE2] in {
 
   // SSE2 with ImmT == Imm8 and XD prefix.
   defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD;
+
+  let AddedComplexity = 5 in
+  def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+            (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+  // Unary v4f32 shuffle with PSHUF* in order to fold a load.
+  def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+            (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+
+  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
+                                   (i8 imm:$imm))),
+            (PSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
+                                   (i8 imm:$imm))),
+            (PSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (PSHUFDri VR128:$src1, imm:$imm)>;
+  def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (PSHUFDri VR128:$src1, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
+            (PSHUFHWri VR128:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
+                               (i8 imm:$imm))),
+            (PSHUFHWmi addr:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
+            (PSHUFLWri VR128:$src, imm:$imm)>;
+  def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
+                               (i8 imm:$imm))),
+            (PSHUFLWmi addr:$src, imm:$imm)>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -2637,71 +3872,69 @@ let Predicates = [HasSSE2] in {
 
 let ExeDomain = SSEPackedInt in {
 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
-                       PatFrag unp_frag, PatFrag bc_frag, bit Is2Addr = 1> {
+                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
   def rr : PDI<opc, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set VR128:$dst, (vt (unp_frag VR128:$src1, VR128:$src2)))]>;
+      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>;
   def rm : PDI<opc, MRMSrcMem,
       (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set VR128:$dst, (unp_frag VR128:$src1,
+      [(set VR128:$dst, (OpNode VR128:$src1,
                                   (bc_frag (memopv2i64
                                                addr:$src2))))]>;
 }
 
 let Predicates = [HasAVX] in {
-  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8,
-                                 0>, VEX_4V;
-  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16,
-                                 0>, VEX_4V;
-  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, unpckl, bc_v4i32,
-                                 0>, VEX_4V;
+  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq,
+                                 bc_v4i32, 0>, VEX_4V;
 
   /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
   /// knew to collapse (bitconvert VT to VT) into its operand.
   def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
-                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                         "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        [(set VR128:$dst,
-                          (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>, VEX_4V;
+            (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+            "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+            [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
+                                                    VR128:$src2)))]>, VEX_4V;
   def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
-                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
-                         "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        [(set VR128:$dst,
-                          (v2i64 (unpckl VR128:$src1,
-                                         (memopv2i64 addr:$src2))))]>, VEX_4V;
-
-  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, unpckh, bc_v16i8,
-                                 0>, VEX_4V;
-  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, unpckh, bc_v8i16,
-                                 0>, VEX_4V;
-  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, unpckh, bc_v4i32,
-                                 0>, VEX_4V;
+            (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+            "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+            [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
+                                        (memopv2i64 addr:$src2))))]>, VEX_4V;
+
+  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq,
+                                 bc_v4i32, 0>, VEX_4V;
 
   /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
   /// knew to collapse (bitconvert VT to VT) into its operand.
   def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
-                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                         "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        [(set VR128:$dst,
-                          (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>, VEX_4V;
+             (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+             "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
+                                                     VR128:$src2)))]>, VEX_4V;
   def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
-                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
-                        "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                        [(set VR128:$dst,
-                          (v2i64 (unpckh VR128:$src1,
-                                         (memopv2i64 addr:$src2))))]>, VEX_4V;
+             (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+             "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
+                                        (memopv2i64 addr:$src2))))]>, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in {
-  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, unpckl, bc_v16i8>;
-  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, unpckl, bc_v8i16>;
-  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, unpckl, bc_v4i32>;
+  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>;
+  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>;
+  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>;
 
   /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
   /// knew to collapse (bitconvert VT to VT) into its operand.
@@ -2709,17 +3942,17 @@ let Constraints = "$src1 = $dst" in {
                          (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                          "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
-                          (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+                          (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>;
   def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
                          (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                          "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
-                          (v2i64 (unpckl VR128:$src1,
+                          (v2i64 (X86Punpcklqdq VR128:$src1,
                                          (memopv2i64 addr:$src2))))]>;
 
-  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, unpckh, bc_v16i8>;
-  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, unpckh, bc_v8i16>;
-  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, unpckh, bc_v4i32>;
+  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>;
+  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>;
+  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>;
 
   /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
   /// knew to collapse (bitconvert VT to VT) into its operand.
@@ -2727,17 +3960,24 @@ let Constraints = "$src1 = $dst" in {
                          (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                          "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
-                          (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+                          (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>;
   def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                         "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
-                          (v2i64 (unpckh VR128:$src1,
+                          (v2i64 (X86Punpckhqdq VR128:$src1,
                                          (memopv2i64 addr:$src2))))]>;
 }
-
 } // ExeDomain = SSEPackedInt
 
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+  def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+            (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+  def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+            (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Extract and Insert
 //===---------------------------------------------------------------------===//
@@ -2769,7 +4009,7 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))]>, OpSize, VEX;
+                                                imm:$src2))]>, TB, OpSize, VEX;
 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2778,11 +4018,11 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
 
 // Insert
 let Predicates = [HasAVX] in {
-  defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V;
+  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
   def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
        "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-       []>, OpSize, VEX_4V;
+       []>, TB, OpSize, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in
@@ -2839,7 +4079,9 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
 // SSE2 - Move Doubleword
 //===---------------------------------------------------------------------===//
 
+//===---------------------------------------------------------------------===//
 // Move Int Doubleword to Packed Double Int
+//
 def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -2849,6 +4091,14 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                       VEX;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
+
 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -2865,8 +4115,9 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))]>;
 
-
+//===---------------------------------------------------------------------===//
 // Move Int Doubleword to Single Scalar
+//
 def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX;
@@ -2883,7 +4134,9 @@ def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
 
+//===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int to Packed Double Int
+//
 def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -2902,22 +4155,48 @@ def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>;
 
-def MOVPQIto64rr  : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "mov{d|q}\t{$src, $dst|$dst, $src}",
+                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                           (iPTR 0)))]>,
+                      TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
+
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                         (iPTR 0)))]>;
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let Predicates = [HasAVX] in
+def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                        VEX;
+def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                          "mov{d|q}\t{$src, $dst|$dst, $src}",
-                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
-                                           (iPTR 0)))]>;
+                         [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
 def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
 
-def MOVSDto64rr  : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
-                        "mov{d|q}\t{$src, $dst|$dst, $src}",
-                        [(set GR64:$dst, (bitconvert FR64:$src))]>;
-def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
-                        "movq\t{$src, $dst|$dst, $src}",
-                        [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
-
+//===---------------------------------------------------------------------===//
 // Move Scalar Single to Double Int
+//
 def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX;
@@ -2931,7 +4210,9 @@ def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
 
-// movd / movq to XMM register zero-extends
+//===---------------------------------------------------------------------===//
+// Patterns and instructions to describe movd/movq to XMM register zero-extends
+//
 let AddedComplexity = 15 in {
 def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
@@ -2967,15 +4248,36 @@ def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        [(set VR128:$dst,
                          (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                    (loadi32 addr:$src))))))]>;
+}
 
-def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+let Predicates = [HasSSE2], AddedComplexity = 20 in {
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (MOVZDI2PDIrm addr:$src)>;
-def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
             (MOVZDI2PDIrm addr:$src)>;
-def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
             (MOVZDI2PDIrm addr:$src)>;
 }
 
+let Predicates = [HasAVX] in {
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+              (VMOVZDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVZDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVZDI2PDIrm addr:$src)>;
+  }
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                                (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                                (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
+}
+
 // These are the correct encodings of the instructions so that we know how to
 // read correct assembly, even though we continue to emit the wrong ones for
 // compatibility with Darwin's buggy assembler.
@@ -2996,7 +4298,9 @@ def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
 // SSE2 - Move Quadword
 //===---------------------------------------------------------------------===//
 
+//===---------------------------------------------------------------------===//
 // Move Quadword Int to Packed Quadword Int
+//
 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -3008,7 +4312,9 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                     Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
 
+//===---------------------------------------------------------------------===//
 // Move Packed Quadword Int to Quadword Int
+//
 def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
@@ -3018,10 +4324,9 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>;
 
-def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
-          (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
-
+//===---------------------------------------------------------------------===//
 // Store / copy lower 64-bits of a XMM register.
+//
 def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
@@ -3037,7 +4342,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                                                  (loadi64 addr:$src))))))]>,
                      XS, VEX, Requires<[HasAVX]>;
 
-let AddedComplexity = 20 in {
+let AddedComplexity = 20 in
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
@@ -3045,15 +4350,27 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                                                  (loadi64 addr:$src))))))]>,
                      XS, Requires<[HasSSE2]>;
 
-def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+let Predicates = [HasSSE2], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (MOVZQI2PQIrm addr:$src)>;
-def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
             (MOVZQI2PQIrm addr:$src)>;
-def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
 }
 
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVZQI2PQIrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
 let AddedComplexity = 15 in
 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
@@ -3077,9 +4394,21 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (loadv2i64 addr:$src))))]>,
                       XS, Requires<[HasSSE2]>;
+}
 
-def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
-            (MOVZPQILo2PQIrm addr:$src)>;
+let AddedComplexity = 20 in {
+  let Predicates = [HasSSE2] in {
+    def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+              (MOVZPQILo2PQIrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (MOVZPQILo2PQIrr VR128:$src)>;
+  }
+  let Predicates = [HasAVX] in {
+    def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+              (VMOVZPQILo2PQIrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (VMOVZPQILo2PQIrr VR128:$src)>;
+  }
 }
 
 // Instructions to match in the assembler
@@ -3101,37 +4430,6 @@ def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  "movq\t{$src, $dst|$dst, $src}", []>, XS;
 
-//===---------------------------------------------------------------------===//
-// SSE2 - Misc Instructions
-//===---------------------------------------------------------------------===//
-
-// Flush cache
-def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
-               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
-              TB, Requires<[HasSSE2]>;
-
-// Load, store, and memory fence
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
-               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
-               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
-def : Pat<(X86LFence), (LFENCE)>;
-def : Pat<(X86MFence), (MFENCE)>;
-
-
-// Pause. This "instruction" is encoded as "rep; nop", so even though it
-// was introduced with SSE2, it's backward compatible.
-def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
-
-// Alias instructions that map zero vector to pxor / xorp* for sse.
-// We set canFoldAsLoad because this can be converted to a constant-pool
-// load of an all-ones value if folding it would be beneficial.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
-  // FIXME: Change encoding to pseudo.
-  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
-                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
-
 //===---------------------------------------------------------------------===//
 // SSE3 - Conversion Instructions
 //===---------------------------------------------------------------------===//
@@ -3164,6 +4462,11 @@ def CVTPD2DQrm  : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
 
+def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+          (VCVTPD2DQYrr VR256:$src)>;
+def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+          (VCVTPD2DQYrm addr:$src)>;
+
 // Convert Packed DW Integers to Packed Double FP
 let Predicates = [HasAVX] in {
 def VCVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -3192,41 +4495,74 @@ def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
 def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
           (VCVTPD2DQYrm addr:$src)>;
 
+def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+          (VCVTDQ2PDYrr VR128:$src)>;
+def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))),
+          (VCVTDQ2PDYrm addr:$src)>;
+
 //===---------------------------------------------------------------------===//
-// SSE3 - Move Instructions
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
-
-// Replicate Single FP
-multiclass sse3_replicate_sfp<bits<8> op, PatFrag rep_frag, string OpcodeStr> {
-def rr : S3SI<op, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                      [(set VR128:$dst, (v4f32 (rep_frag
-                                                VR128:$src, (undef))))]>;
-def rm : S3SI<op, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                      [(set VR128:$dst, (rep_frag
-                                         (memopv4f32 addr:$src), (undef)))]>;
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>;
 }
 
-multiclass sse3_replicate_sfp_y<bits<8> op, PatFrag rep_frag,
-                                string OpcodeStr> {
-def rr : S3SI<op, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
-              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
-def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
-              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+let Predicates = [HasAVX] in {
+  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
+  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+
+let Predicates = [HasSSE3] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (MOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (MOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSLDUPrm addr:$src)>;
 }
 
 let Predicates = [HasAVX] in {
-  // FIXME: Merge above classes when we have patterns for the ymm version
-  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX;
-  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX;
-  defm VMOVSHDUPY : sse3_replicate_sfp_y<0x16, movshdup, "vmovshdup">, VEX;
-  defm VMOVSLDUPY : sse3_replicate_sfp_y<0x12, movsldup, "vmovsldup">, VEX;
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (VMOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VMOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (VMOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VMOVSLDUPrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+            (VMOVSHDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+            (VMOVSHDUPYrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+            (VMOVSLDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+            (VMOVSLDUPYrm addr:$src)>;
 }
-defm MOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "movshdup">;
-defm MOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "movsldup">;
 
-// Replicate Double FP
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
 multiclass sse3_replicate_dfp<string OpcodeStr> {
 def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -3238,23 +4574,90 @@ def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                                       (undef))))]>;
 }
 
+// FIXME: Merge with above classe when there're patterns for the ymm version
 multiclass sse3_replicate_dfp_y<string OpcodeStr> {
-def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
-                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                    []>;
-def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
-                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                    []>;
+let Predicates = [HasAVX] in {
+  def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      []>;
+  def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      []>;
+  }
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
+defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+
+let Predicates = [HasSSE3] in {
+  def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+                   (undef)),
+            (MOVDDUPrm addr:$src)>;
+  let AddedComplexity = 5 in {
+  def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+            (MOVDDUPrm addr:$src)>;
+  }
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (MOVDDUPrm addr:$src)>;
 }
 
 let Predicates = [HasAVX] in {
-  // FIXME: Merge above classes when we have patterns for the ymm version
-  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
-  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+  def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+                   (undef)),
+            (VMOVDDUPrm addr:$src)>;
+  let AddedComplexity = 5 in {
+  def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+            (VMOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
+  def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+            (VMOVDDUPrm addr:$src)>;
+  }
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4f64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
 }
-defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
-// Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
 let Predicates = [HasAVX] in {
   def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vlddqu\t{$src, $dst|$dst, $src}",
@@ -3267,38 +4670,6 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "lddqu\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
 
-def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
-                   (undef)),
-          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
-// Several Move patterns
-let AddedComplexity = 5 in {
-def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
-          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
-          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
-          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
-          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-}
-
-// vector_shuffle v1, <undef> <1, 1, 3, 3>
-let AddedComplexity = 15 in
-def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
-          (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
-let AddedComplexity = 20 in
-def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
-          (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
-// vector_shuffle v1, <undef> <0, 0, 2, 2>
-let AddedComplexity = 15 in
-  def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
-            (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
-let AddedComplexity = 20 in
-  def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
-            (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
 //===---------------------------------------------------------------------===//
 // SSE3 - Arithmetic
 //===---------------------------------------------------------------------===//
@@ -3344,62 +4715,58 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
 
 // Horizontal ops
 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                   X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
 
   def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                  X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
 
   def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 
 let Predicates = [HasAVX] in {
   defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
-                          int_x86_sse3_hadd_ps, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
-                          int_x86_sse3_hadd_pd, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
-                          int_x86_sse3_hsub_ps, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
   defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
-                          int_x86_sse3_hsub_pd, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
   defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                          int_x86_avx_hadd_ps_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hadd_pd_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                          int_x86_avx_hsub_ps_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
   defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hsub_pd_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in {
-  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hadd_ps>;
-  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hadd_pd>;
-  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hsub_ps>;
-  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hsub_pd>;
+  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -3466,7 +4833,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let Predicates = [HasAVX] in {
+let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16,
                                       int_x86_ssse3_phadd_w_128, 0>, VEX_4V;
@@ -3525,17 +4892,33 @@ defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16,
                                      int_x86_ssse3_pmul_hr_sw_128>;
 }
 
-def : Pat<(X86pshufb VR128:$src, VR128:$mask),
-          (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
-def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
-          (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+let Predicates = [HasSSSE3] in {
+  def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+            (PSHUFBrr128 VR128:$src, VR128:$mask)>;
+  def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+            (PSHUFBrm128 VR128:$src, addr:$mask)>;
 
-def : Pat<(X86psignb VR128:$src1, VR128:$src2),
-          (PSIGNBrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
-def : Pat<(X86psignw VR128:$src1, VR128:$src2),
-          (PSIGNWrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
-def : Pat<(X86psignd VR128:$src1, VR128:$src2),
-          (PSIGNDrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
+  def : Pat<(X86psignb VR128:$src1, VR128:$src2),
+            (PSIGNBrr128 VR128:$src1, VR128:$src2)>;
+  def : Pat<(X86psignw VR128:$src1, VR128:$src2),
+            (PSIGNWrr128 VR128:$src1, VR128:$src2)>;
+  def : Pat<(X86psignd VR128:$src1, VR128:$src2),
+            (PSIGNDrr128 VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+            (VPSHUFBrr128 VR128:$src, VR128:$mask)>;
+  def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+            (VPSHUFBrm128 VR128:$src, addr:$mask)>;
+
+  def : Pat<(X86psignb VR128:$src1, VR128:$src2),
+            (VPSIGNBrr128 VR128:$src1, VR128:$src2)>;
+  def : Pat<(X86psignw VR128:$src1, VR128:$src2),
+            (VPSIGNWrr128 VR128:$src1, VR128:$src2)>;
+  def : Pat<(X86psignd VR128:$src1, VR128:$src2),
+            (VPSIGNDrr128 VR128:$src1, VR128:$src2)>;
+}
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Align Instruction Patterns
@@ -3560,387 +4943,57 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
 
 let Predicates = [HasAVX] in
   defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst" in
+let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
   defm PALIGN : ssse3_palign<"palignr">;
 
-let AddedComplexity = 5 in {
-def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
-          (PALIGNR128rr VR128:$src2, VR128:$src1,
-                        (SHUFFLE_get_palign_imm VR128:$src3))>,
-      Requires<[HasSSSE3]>;
-def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
-          (PALIGNR128rr VR128:$src2, VR128:$src1,
-                        (SHUFFLE_get_palign_imm VR128:$src3))>,
-      Requires<[HasSSSE3]>;
-def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
-          (PALIGNR128rr VR128:$src2, VR128:$src1,
-                        (SHUFFLE_get_palign_imm VR128:$src3))>,
-      Requires<[HasSSSE3]>;
-def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
-          (PALIGNR128rr VR128:$src2, VR128:$src1,
-                        (SHUFFLE_get_palign_imm VR128:$src3))>,
-      Requires<[HasSSSE3]>;
-}
-
-//===---------------------------------------------------------------------===//
-// SSSE3 Misc Instructions
-//===---------------------------------------------------------------------===//
-
-// Thread synchronization
-let usesCustomInserter = 1 in {
-def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
-                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>;
-def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2),
-                [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>;
-}
-
-let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
-                 Requires<[HasSSE3]>;
-let Uses = [ECX, EAX] in
-def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB,
-                Requires<[HasSSE3]>;
-
-def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
-def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
-
-def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
-      Requires<[In32BitMode]>;
-def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
-      Requires<[In64BitMode]>;
-
-//===---------------------------------------------------------------------===//
-// Non-Instruction Patterns
-//===---------------------------------------------------------------------===//
-
-// extload f32 -> f64.  This matches load+fextend because we have a hack in
-// the isel (PreprocessForFPConvert) that can introduce loads after dag
-// combine.
-// Since these loads aren't folded into the fextend, we have to match it
-// explicitly here.
-let Predicates = [HasSSE2] in
- def : Pat<(fextend (loadf32 addr:$src)),
-           (CVTSS2SDrm addr:$src)>;
-
-// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
-// in the non-AVX version bits 127:64 aren't touched. Find a better way to
-// represent this instead of always zeroing SRC1. One possible solution is
-// to represent the instruction w/ something similar as the "$src1 = $dst"
-// constraint but without the tied operands.
-let Predicates = [HasAVX] in
- def : Pat<(fextend (loadf32 addr:$src)),
-           (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)),
-                        addr:$src)>;
-
-// bit_convert
-let Predicates = [HasXMMInt] in {
-  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
-  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
-  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
-  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
-  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
-  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
-  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
-  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
-  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
-  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
-  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
-  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
-  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
-  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
-}
-
-let Predicates = [HasAVX] in {
-  def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
-  def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
-  def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
-  def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
-  def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
-  def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
-  def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
-  def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
-  def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
-  def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
-  def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
-  def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
-  def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
-  def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
-}
-
-// Move scalar to XMM zero-extended
-// movd to XMM register zero-extends
-let AddedComplexity = 15 in {
-// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-          (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-          (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-          (MOVSSrr (v4f32 (V_SET0PS)),
-                   (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
-def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-          (MOVSSrr (v4i32 (V_SET0PI)),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
-}
-
-// Splat v2f64 / v2i64
-let AddedComplexity = 10 in {
-def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
-          (UNPCKLPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
-def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
-          (UNPCKHPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
-def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
-          (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
-          (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-}
-
-// Special unary SHUFPSrri case.
-def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
-          (SHUFPSrri VR128:$src1, VR128:$src1,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>;
-let AddedComplexity = 5 in
-def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
-          (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-      Requires<[HasSSE2]>;
-// Special unary SHUFPDrri case.
-def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
-          (SHUFPDrri VR128:$src1, VR128:$src1,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
-      Requires<[HasSSE2]>;
-// Special unary SHUFPDrri case.
-def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
-          (SHUFPDrri VR128:$src1, VR128:$src1,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
-      Requires<[HasSSE2]>;
-// Unary v4f32 shuffle with PSHUF* in order to fold a load.
-def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
-          (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-      Requires<[HasSSE2]>;
-
-// Special binary v4i32 shuffle cases with SHUFPS.
-def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
-          (SHUFPSrri VR128:$src1, VR128:$src2,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
-           Requires<[HasSSE2]>;
-def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
-          (SHUFPSrmi VR128:$src1, addr:$src2,
-                    (SHUFFLE_get_shuf_imm VR128:$src3))>,
-           Requires<[HasSSE2]>;
-// Special binary v2i64 shuffle cases using SHUFPDrri.
-def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
-          (SHUFPDrri VR128:$src1, VR128:$src2,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
-          Requires<[HasSSE2]>;
-
-// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
-let AddedComplexity = 15 in {
-def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
-          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-          Requires<[OptForSpeed, HasSSE2]>;
-def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
-          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-          Requires<[OptForSpeed, HasSSE2]>;
-}
-let AddedComplexity = 10 in {
-def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
-          (UNPCKLPSrr VR128:$src, VR128:$src)>;
-def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
-          (PUNPCKLBWrr VR128:$src, VR128:$src)>;
-def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
-          (PUNPCKLWDrr VR128:$src, VR128:$src)>;
-def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
-          (PUNPCKLDQrr VR128:$src, VR128:$src)>;
-}
-
-// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
-let AddedComplexity = 15 in {
-def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
-          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-          Requires<[OptForSpeed, HasSSE2]>;
-def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
-          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
-          Requires<[OptForSpeed, HasSSE2]>;
-}
-let AddedComplexity = 10 in {
-def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
-          (UNPCKHPSrr VR128:$src, VR128:$src)>;
-def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
-          (PUNPCKHBWrr VR128:$src, VR128:$src)>;
-def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
-          (PUNPCKHWDrr VR128:$src, VR128:$src)>;
-def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
-          (PUNPCKHDQrr VR128:$src, VR128:$src)>;
-}
-
-let AddedComplexity = 20 in {
-// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
-def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
-def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
-          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
-def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
-          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
-          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-}
-
-let AddedComplexity = 20 in {
-// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
-def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
-          (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
-          (MOVLPDrm VR128:$src1, addr:$src2)>;
-}
-
-// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
-def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
-          (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
-          (MOVLPDmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
-                 addr:$src1),
-          (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
-          (MOVLPDmr addr:$src1, VR128:$src2)>;
-
-let AddedComplexity = 15 in {
-// Setting the lowest element in the vector.
-def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4i32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2i64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
-// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
-def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
-      Requires<[HasSSE2]>;
-def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
-      Requires<[HasSSE2]>;
-}
-
-// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
-// fall back to this for SSE1)
-def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
-          (SHUFPSrri VR128:$src2, VR128:$src1,
-                     (SHUFFLE_get_shuf_imm VR128:$src3))>;
-
-// Set lowest element and zero upper elements.
-def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
-          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
-
-// vector -> vector casts
-def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
-          (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
-          (CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
-
-// Use movaps / movups for SSE integer load / store (one byte shorter).
-// The instructions selected below are then converted to MOVDQA/MOVDQU
-// during the SSE domain pass.
-let Predicates = [HasSSE1] in {
-  def : Pat<(alignedloadv4i32 addr:$src),
-            (MOVAPSrm addr:$src)>;
-  def : Pat<(loadv4i32 addr:$src),
-            (MOVUPSrm addr:$src)>;
-  def : Pat<(alignedloadv2i64 addr:$src),
-            (MOVAPSrm addr:$src)>;
-  def : Pat<(loadv2i64 addr:$src),
-            (MOVUPSrm addr:$src)>;
-
-  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
-            (MOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
-            (MOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
-            (MOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
-            (MOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
-            (MOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
-            (MOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
-            (MOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
-            (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [HasSSSE3] in {
+def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
 }
 
-// Use vmovaps/vmovups for AVX integer load/store.
 let Predicates = [HasAVX] in {
-  // 128-bit load/store
-  def : Pat<(alignedloadv4i32 addr:$src),
-            (VMOVAPSrm addr:$src)>;
-  def : Pat<(loadv4i32 addr:$src),
-            (VMOVUPSrm addr:$src)>;
-  def : Pat<(alignedloadv2i64 addr:$src),
-            (VMOVAPSrm addr:$src)>;
-  def : Pat<(loadv2i64 addr:$src),
-            (VMOVUPSrm addr:$src)>;
+def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
 
-  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
+//===---------------------------------------------------------------------===//
+// SSSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
 
-  // 256-bit load/store
-  def : Pat<(alignedloadv4i64 addr:$src),
-            (VMOVAPSYrm addr:$src)>;
-  def : Pat<(loadv4i64 addr:$src),
-            (VMOVUPSYrm addr:$src)>;
-  def : Pat<(alignedloadv8i32 addr:$src),
-            (VMOVAPSYrm addr:$src)>;
-  def : Pat<(loadv8i32 addr:$src),
-            (VMOVUPSYrm addr:$src)>;
-  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
-            (VMOVAPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
-            (VMOVAPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
-            (VMOVUPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
-            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>;
+def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2),
+                [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>;
 }
 
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
+                 Requires<[HasSSE3]>;
+let Uses = [ECX, EAX] in
+def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB,
+                Requires<[HasSSE3]>;
+
+def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+      Requires<[In32BitMode]>;
+def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+      Requires<[In64BitMode]>;
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Move with Sign/Zero Extend
 //===----------------------------------------------------------------------===//
@@ -3979,36 +5032,71 @@ defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
 defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
 defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
 
-// Common patterns involving scalar load.
-def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
-          (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
-          (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
-          (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
-          (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
-          (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
-          (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
-          (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
-          (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
-          (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
-          (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
-          (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
-          (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+}
 
 
 multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
@@ -4039,17 +5127,31 @@ defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
 defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
 defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
 
-// Common patterns involving scalar load
-def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
-          (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
-          (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVSXWQrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
-          (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
-          (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVZXWQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXWQrm addr:$src)>;
 
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXWQrm addr:$src)>;
+}
 
 multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -4073,16 +5175,31 @@ defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
 defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
 
-// Common patterns involving scalar load
-def : Pat<(int_x86_sse41_pmovsxbq
-            (bitconvert (v4i32 (X86vzmovl
-                             (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-          (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
 
-def : Pat<(int_x86_sse41_pmovzxbq
-            (bitconvert (v4i32 (X86vzmovl
-                             (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
-          (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Extract Instructions
@@ -4208,7 +5325,12 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                               imm:$src2))),
                  addr:$dst),
           (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
-         Requires<[HasSSE41]>;
+          Requires<[HasSSE41]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[HasAVX]>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Insert Instructions
@@ -4297,7 +5419,7 @@ let Constraints = "$src1 = $dst" in
 // in the target vector.
 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -4306,7 +5428,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
         (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
       OpSize;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -4348,7 +5470,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
 
   // Vector intrinsic operation, mem
   def PSm : Ii8<opcps, MRMSrcMem,
-                    (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
@@ -4366,7 +5488,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
 
   // Vector intrinsic operation, mem
   def PDm : SS4AIi8<opcpd, MRMSrcMem,
-                    (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2),
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
@@ -4501,14 +5623,14 @@ let Predicates = [HasAVX] in {
                                   int_x86_avx_round_pd_256>, VEX;
   defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                   int_x86_sse41_round_ss,
-                                  int_x86_sse41_round_sd, 0>, VEX_4V;
+                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
 
   // Instructions for the assembler
   defm VROUND  : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">,
                                         VEX;
   defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">,
                                         VEX;
-  defm VROUND  : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V;
+  defm VROUND  : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
 }
 
 defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -4578,26 +5700,34 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
 // SSE4.1 - Misc Instructions
 //===----------------------------------------------------------------------===//
 
-def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
-                   "popcnt{w}\t{$src, $dst|$dst, $src}",
-                   [(set GR16:$dst, (ctpop GR16:$src))]>, OpSize, XS;
-def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
-                   "popcnt{w}\t{$src, $dst|$dst, $src}",
-                   [(set GR16:$dst, (ctpop (loadi16 addr:$src)))]>, OpSize, XS;
-
-def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
-                   "popcnt{l}\t{$src, $dst|$dst, $src}",
-                   [(set GR32:$dst, (ctpop GR32:$src))]>, XS;
-def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
-                   "popcnt{l}\t{$src, $dst|$dst, $src}",
-                   [(set GR32:$dst, (ctpop (loadi32 addr:$src)))]>, XS;
-
-def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
-                    "popcnt{q}\t{$src, $dst|$dst, $src}",
-                    [(set GR64:$dst, (ctpop GR64:$src))]>, XS;
-def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
-                    "popcnt{q}\t{$src, $dst|$dst, $src}",
-                    [(set GR64:$dst, (ctpop (loadi64 addr:$src)))]>, XS;
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+                     OpSize, XS;
+  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+                      (implicit EFLAGS)]>, OpSize, XS;
+
+  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+
+  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+                      XS;
+  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+                       (implicit EFLAGS)]>, XS;
+}
 
 
 
@@ -4666,6 +5796,11 @@ let Predicates = [HasAVX] in {
                                                          0>, VEX_4V;
   defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                          0>, VEX_4V;
+
+  def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+            (VPCMPEQQrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+            (VPCMPEQQrm VR128:$src1, addr:$src2)>;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4720,7 +5855,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                  X86MemOperand x86memop, bit Is2Addr = 1> {
   let isCommutable = 1 in
   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
-        (ins RC:$src1, RC:$src2, i32i8imm:$src3),
+        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -4729,7 +5864,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
         OpSize;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
-        (ins RC:$src1, x86memop:$src2, i32i8imm:$src3),
+        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -4815,6 +5950,36 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
                                          memopv32i8, int_x86_avx_blendv_ps_256>;
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                            (v8i32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+                            (v8f32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                            (v4i64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+                            (v4f64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
@@ -4835,12 +6000,27 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   }
 }
 
-defm BLENDVPD     : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
-defm BLENDVPS     : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
-defm PBLENDVB     : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
-
-def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0),
-          (PBLENDVBrr0 VR128:$src1, VR128:$src2)>;
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
+let Predicates = [HasSSE41] in {
+  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
 
 let Predicates = [HasAVX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4876,9 +6056,16 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX] in {
   defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
                                      0>, VEX_4V;
+
+  def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+            (VPCMPGTQrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+            (VPCMPGTQrm VR128:$src1, addr:$src2)>;
+}
+
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
 
@@ -5158,22 +6345,43 @@ let Constraints = "$src1 = $dst" in {
                          int_x86_aesni_aesdeclast>;
 }
 
-def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
-          (AESENCrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
-          (AESENCrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
-          (AESENCLASTrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
-          (AESENCLASTrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
-          (AESDECrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
-          (AESDECrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
-          (AESDECLASTrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
-          (AESDECLASTrm VR128:$src1, addr:$src2)>;
+let Predicates = [HasAES] in {
+  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
+            (AESENCrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
+            (AESENCrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
+            (AESENCLASTrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
+            (AESENCLASTrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
+            (AESDECrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
+            (AESDECrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
+            (AESDECLASTrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
+            (AESDECLASTrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
+            (VAESENCrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
+            (VAESENCrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
+            (VAESENCLASTrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
+            (VAESENCLASTrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
+            (VAESDECrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
+            (VAESDECrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
+            (VAESDECLASTrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
+            (VAESDECLASTrm VR128:$src1, addr:$src2)>;
+}
 
 // Perform the AES InvMixColumn Transformation
 let Predicates = [HasAVX, HasAES] in {
@@ -5288,8 +6496,10 @@ defm : pclmul_alias<"lqlq", 0x00>;
 // AVX Instructions
 //===----------------------------------------------------------------------===//
 
-
-// Load from memory and broadcast to all elements of the destination operand
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+//              destination operand
+//
 class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                     X86MemOperand x86memop, Intrinsic Int> :
   AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
@@ -5305,7 +6515,26 @@ def VBROADCASTSD   : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
 def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                    int_x86_avx_vbroadcastf128_pd_256>;
 
-// Insert packed floating-point values
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+          (VBROADCASTF128 addr:$src)>;
+
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VBROADCASTSD addr:$src)>;
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSD addr:$src)>;
+
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSS addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSS addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR128:$src2, i8imm:$src3),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -5315,7 +6544,41 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
           "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, VEX_4V;
 
-// Extract packed floating-point values
+def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
+          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
+          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
+          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
           (ins VR256:$src1, i8imm:$src2),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -5325,7 +6588,41 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
           "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
           []>, VEX;
 
-// Conditional SIMD Packed Loads and Stores
+def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
+          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
+          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
+          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v4f32 (VEXTRACTF128rr
+                    (v8f32 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v2f64 (VEXTRACTF128rr
+                    (v4f64 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v4i32 (VEXTRACTF128rr
+                    (v8i32 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v2i64 (VEXTRACTF128rr
+                    (v4i64 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v8i16 (VEXTRACTF128rr
+                    (v16i16 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+          (v16i8 (VEXTRACTF128rr
+                    (v32i8 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                           Intrinsic IntLd, Intrinsic IntLd256,
                           Intrinsic IntSt, Intrinsic IntSt256,
@@ -5363,7 +6660,9 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                  int_x86_avx_maskstore_pd_256,
                                  memopv2f64, memopv4f64>;
 
-// Permute Floating-Point Values
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
                       X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag,
@@ -5404,6 +6703,18 @@ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                              int_x86_avx_vpermilvar_pd_256,
                              int_x86_avx_vpermil_pd_256>;
 
+def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -5413,65 +6724,6 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>, VEX_4V;
 
-// Zero All YMM registers
-def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
-                 [(int_x86_avx_vzeroall)]>, VEX, VEX_L, Requires<[HasAVX]>;
-
-// Zero Upper bits of YMM registers
-def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
-                   [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>;
-
-def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
-          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
-          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
-          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
-          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
-          (v4f32 (VEXTRACTF128rr
-                    (v8f32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
-          (v2f64 (VEXTRACTF128rr
-                    (v4f64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
-          (v4i32 (VEXTRACTF128rr
-                    (v8i32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
-          (v2i64 (VEXTRACTF128rr
-                    (v4i64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-
-def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
-          (VBROADCASTF128 addr:$src)>;
-
 def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
 def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3),
@@ -5489,377 +6741,59 @@ def : Pat<(int_x86_avx_vperm2f128_si_256
                   VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
 
+def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
 //===----------------------------------------------------------------------===//
-// SSE Shuffle pattern fragments
-//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+  // Zero All YMM registers
+  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;
 
-// This is part of a "work in progress" refactoring. The idea is that all
-// vector shuffles are going to be translated into target specific nodes and
-// directly matched by the patterns below (which can be changed along the way)
-// The AVX version of some but not all of them are described here, and more
-// should come in a near future.
-
-// Shuffle with PSHUFD instruction folding loads. The first two patterns match
-// SSE2 loads, which are always promoted to v2i64. The last one should match
-// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD
-// in SSE2, how does it ever worked? Anyway, the pattern will remain here until
-// we investigate further.
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
-                                 (i8 imm:$imm))),
-          (VPSHUFDmi addr:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
-                                 (i8 imm:$imm))),
-          (PSHUFDmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
-                                 (i8 imm:$imm))),
-          (PSHUFDmi addr:$src1, imm:$imm)>; // FIXME: has this ever worked?
-
-// Shuffle with PSHUFD instruction.
-def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-          (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-          (PSHUFDri VR128:$src1, imm:$imm)>;
-
-def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-          (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-          (PSHUFDri VR128:$src1, imm:$imm)>;
-
-// Shuffle with SHUFPD instruction.
-def : Pat<(v2f64 (X86Shufps VR128:$src1,
-                     (memopv2f64 addr:$src2), (i8 imm:$imm))),
-          (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Shufps VR128:$src1,
-                     (memopv2f64 addr:$src2), (i8 imm:$imm))),
-          (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-// Shuffle with SHUFPS instruction.
-def : Pat<(v4f32 (X86Shufps VR128:$src1,
-                     (memopv4f32 addr:$src2), (i8 imm:$imm))),
-          (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Shufps VR128:$src1,
-                     (memopv4f32 addr:$src2), (i8 imm:$imm))),
-          (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-def : Pat<(v4i32 (X86Shufps VR128:$src1,
-                     (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
-          (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86Shufps VR128:$src1,
-                     (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
-          (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-// Shuffle with MOVHLPS instruction
-def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
-          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
-          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with MOVDDUP instruction
-def : Pat<(X86Movddup (memopv2f64 addr:$src)),
-          (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (memopv2f64 addr:$src)),
-          (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
-          (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
-          (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
-          (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
-          (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
-          (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
-          (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64
-                           (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
-          (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64
-                           (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
-          (MOVDDUPrm addr:$src)>;
-
-
-// Shuffle with UNPCKLPS
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
-          (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
-          (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
-          (UNPCKLPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
-          (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
-          (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
-          (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKHPS
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
-          (VUNPCKHPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
-          (UNPCKHPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
-          (VUNPCKHPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
-          (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKLPD
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
-          (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
-          (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
-          (UNPCKLPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
-          (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
-          (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
-          (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKHPD
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
-          (VUNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
-          (UNPCKHPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
-          (VUNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
-          (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLBW
-def : Pat<(v16i8 (X86Punpcklbw VR128:$src1,
-                                   (bc_v16i8 (memopv2i64 addr:$src2)))),
-          (PUNPCKLBWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (X86Punpcklbw VR128:$src1, VR128:$src2)),
-          (PUNPCKLBWrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLWD
-def : Pat<(v8i16 (X86Punpcklwd VR128:$src1,
-                                   (bc_v8i16 (memopv2i64 addr:$src2)))),
-          (PUNPCKLWDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86Punpcklwd VR128:$src1, VR128:$src2)),
-          (PUNPCKLWDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLDQ
-def : Pat<(v4i32 (X86Punpckldq VR128:$src1,
-                                   (bc_v4i32 (memopv2i64 addr:$src2)))),
-          (PUNPCKLDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Punpckldq VR128:$src1, VR128:$src2)),
-          (PUNPCKLDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLQDQ
-def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, (memopv2i64 addr:$src2))),
-          (PUNPCKLQDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)),
-          (PUNPCKLQDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHBW
-def : Pat<(v16i8 (X86Punpckhbw VR128:$src1,
-                                   (bc_v16i8 (memopv2i64 addr:$src2)))),
-          (PUNPCKHBWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (X86Punpckhbw VR128:$src1, VR128:$src2)),
-          (PUNPCKHBWrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHWD
-def : Pat<(v8i16 (X86Punpckhwd VR128:$src1,
-                                   (bc_v8i16 (memopv2i64 addr:$src2)))),
-          (PUNPCKHWDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86Punpckhwd VR128:$src1, VR128:$src2)),
-          (PUNPCKHWDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHDQ
-def : Pat<(v4i32 (X86Punpckhdq VR128:$src1,
-                                   (bc_v4i32 (memopv2i64 addr:$src2)))),
-          (PUNPCKHDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Punpckhdq VR128:$src1, VR128:$src2)),
-          (PUNPCKHDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHQDQ
-def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, (memopv2i64 addr:$src2))),
-          (PUNPCKHQDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)),
-          (PUNPCKHQDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with MOVLHPS
-def : Pat<(X86Movlhps VR128:$src1,
-                    (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
-          (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlhps VR128:$src1,
-                    (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
-          (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
-
-// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v2f64 (X86Movddup VR128:$src)),
-          (UNPCKLPDrr VR128:$src, VR128:$src)>;
-
-// Shuffle with MOVLHPD
-def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
-                    (scalar_to_vector (loadf64 addr:$src2)))),
-          (MOVHPDrm VR128:$src1, addr:$src2)>;
-
-// FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
-                    (scalar_to_vector (loadf64 addr:$src2)))),
-          (MOVHPDrm VR128:$src1, addr:$src2)>;
-
-// Shuffle with MOVSS
-def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
-          (MOVSSrr VR128:$src1, FR32:$src2)>;
-def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4i32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4f32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-// FIXME: Instead of a X86Movss there should be a X86Movlps here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(X86Movss VR128:$src1,
-                    (bc_v4i32 (v2i64 (load addr:$src2)))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-
-// Shuffle with MOVSD
-def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
-          (MOVSDrr VR128:$src1, FR64:$src2)>;
-def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2i64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2f64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
-
-// Shuffle with MOVSHDUP
-def : Pat<(v4i32 (X86Movshdup VR128:$src)),
-          (MOVSHDUPrr VR128:$src)>;
-def : Pat<(X86Movshdup (bc_v4i32 (memopv2i64 addr:$src))),
-          (MOVSHDUPrm addr:$src)>;
-
-def : Pat<(v4f32 (X86Movshdup VR128:$src)),
-          (MOVSHDUPrr VR128:$src)>;
-def : Pat<(X86Movshdup (memopv4f32 addr:$src)),
-          (MOVSHDUPrm addr:$src)>;
-
-// Shuffle with MOVSLDUP
-def : Pat<(v4i32 (X86Movsldup VR128:$src)),
-          (MOVSLDUPrr VR128:$src)>;
-def : Pat<(X86Movsldup (bc_v4i32 (memopv2i64 addr:$src))),
-          (MOVSLDUPrm addr:$src)>;
-
-def : Pat<(v4f32 (X86Movsldup VR128:$src)),
-          (MOVSLDUPrr VR128:$src)>;
-def : Pat<(X86Movsldup (memopv4f32 addr:$src)),
-          (MOVSLDUPrm addr:$src)>;
-
-// Shuffle with PSHUFHW
-def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
-          (PSHUFHWri VR128:$src, imm:$imm)>;
-def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))),
-          (PSHUFHWmi addr:$src, imm:$imm)>;
-
-// Shuffle with PSHUFLW
-def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
-          (PSHUFLWri VR128:$src, imm:$imm)>;
-def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))),
-          (PSHUFLWmi addr:$src, imm:$imm)>;
-
-// Shuffle with PALIGN
-def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
-          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+  // Zero Upper bits of YMM registers
+  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+                     [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
+}
 
-// Shuffle with MOVLPS
-def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlps VR128:$src1,
-                    (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
-          (MOVLPSrm VR128:$src1, addr:$src2)>;
-// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-
-def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), 
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; 
-
-// Shuffle with MOVLPD
-def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
-          (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
-          (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (X86Movlpd VR128:$src1,
-                            (scalar_to_vector (loadf64 addr:$src2)))),
-          (MOVLPDrm VR128:$src1, addr:$src2)>;
-
-// Extra patterns to match stores with MOVHPS/PD and MOVLPS/PD
-def : Pat<(store (f64 (vector_extract
-          (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))),addr:$dst),
-          (MOVHPSmr addr:$dst, VR128:$src)>;
-def : Pat<(store (f64 (vector_extract
-          (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst),
-          (MOVHPDmr addr:$dst, VR128:$src)>;
-
-def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),addr:$src1),
-          (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v4i32 (X86Movlps
-                 (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
-          (MOVLPSmr addr:$src1, VR128:$src2)>;
-
-def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1),
-          (MOVLPDmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1),
-          (MOVLPDmr addr:$src1, VR128:$src2)>;
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//
+let Predicates = [HasAVX, HasF16C] in {
+  def VCVTPH2PSrm : I<0x13, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+  def VCVTPH2PSrr : I<0x13, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+  def VCVTPH2PSYrm : I<0x13, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+  def VCVTPH2PSYrr : I<0x13, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+  def VCVTPS2PHmr : Ii8<0x1D, MRMDestMem, (outs f64mem:$dst),
+                      (ins VR128:$src1, i32i8imm:$src2),
+                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                      TA, OpSize, VEX;
+  def VCVTPS2PHrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+                      (ins VR128:$src1, i32i8imm:$src2),
+                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                      TA, OpSize, VEX;
+  def VCVTPS2PHYmr : Ii8<0x1D, MRMDestMem, (outs f128mem:$dst),
+                      (ins VR256:$src1, i32i8imm:$src2),
+                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                      TA, OpSize, VEX;
+  def VCVTPS2PHYrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+                      (ins VR256:$src1, i32i8imm:$src2),
+                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                      TA, OpSize, VEX;
+}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 31de878343ef6..05a5b36b95eda 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -67,43 +67,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
 //
 let Defs = [AL], Uses = [DX] in
 def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
-               "in{b}\t{%dx, %al|%AL, %DX}", []>;
+               "in{b}\t{%dx, %al|AL, DX}", []>;
 let Defs = [AX], Uses = [DX] in
 def IN16rr : I<0xED, RawFrm, (outs), (ins),
-               "in{w}\t{%dx, %ax|%AX, %DX}", []>,  OpSize;
+               "in{w}\t{%dx, %ax|AX, DX}", []>,  OpSize;
 let Defs = [EAX], Uses = [DX] in
 def IN32rr : I<0xED, RawFrm, (outs), (ins),
-               "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+               "in{l}\t{%dx, %eax|EAX, DX}", []>;
 
 let Defs = [AL] in
 def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
-                  "in{b}\t{$port, %al|%AL, $port}", []>;
+                  "in{b}\t{$port, %al|AL, $port}", []>;
 let Defs = [AX] in
 def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
-                  "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+                  "in{w}\t{$port, %ax|AX, $port}", []>, OpSize;
 let Defs = [EAX] in
 def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
-                  "in{l}\t{$port, %eax|%EAX, $port}", []>;
+                  "in{l}\t{$port, %eax|EAX, $port}", []>;
 
 let Uses = [DX, AL] in
 def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
-                "out{b}\t{%al, %dx|%DX, %AL}", []>;
+                "out{b}\t{%al, %dx|DX, AL}", []>;
 let Uses = [DX, AX] in
 def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
-                "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+                "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize;
 let Uses = [DX, EAX] in
 def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
-                "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+                "out{l}\t{%eax, %dx|DX, EAX}", []>;
 
 let Uses = [AL] in
 def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
-                   "out{b}\t{%al, $port|$port, %AL}", []>;
+                   "out{b}\t{%al, $port|$port, AL}", []>;
 let Uses = [AX] in
 def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
-                   "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+                   "out{w}\t{%ax, $port|$port, AX}", []>, OpSize;
 let Uses = [EAX] in
 def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
-                   "out{l}\t{%eax, $port|$port, %EAX}", []>;
+                   "out{l}\t{%eax, $port|$port, EAX}", []>;
 
 def IN8  : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>;
 def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>,  OpSize;
@@ -229,65 +229,65 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
              "ltr{w}\t{$src}", []>, TB;
              
 def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
-                 "push{w}\t%cs", []>, Requires<[In32BitMode]>, OpSize;
+                 "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize;
 def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
-                 "push{l}\t%cs", []>, Requires<[In32BitMode]>;
+                 "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>;
 def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
-                 "push{w}\t%ss", []>, Requires<[In32BitMode]>, OpSize;
+                 "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize;
 def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
-                 "push{l}\t%ss", []>, Requires<[In32BitMode]>;
+                 "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>;
 def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
-                 "push{w}\t%ds", []>, Requires<[In32BitMode]>, OpSize;
+                 "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize;
 def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
-                 "push{l}\t%ds", []>, Requires<[In32BitMode]>;
+                 "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>;
 def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
-                 "push{w}\t%es", []>, Requires<[In32BitMode]>, OpSize;
+                 "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize;
 def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
-                 "push{l}\t%es", []>, Requires<[In32BitMode]>;
+                 "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>;
                  
 def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{w}\t%fs", []>, OpSize, TB;
+                 "push{w}\t{%fs|FS}", []>, OpSize, TB;
 def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{l}\t%fs", []>, TB, Requires<[In32BitMode]>;
+                 "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>;
 def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{w}\t%gs", []>, OpSize, TB;
+                 "push{w}\t{%gs|GS}", []>, OpSize, TB;
 def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{l}\t%gs", []>, TB, Requires<[In32BitMode]>;
+                 "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>;
 
 def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{q}\t%fs", []>, TB;
+                 "push{q}\t{%fs|FS}", []>, TB;
 def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{q}\t%gs", []>, TB;
+                 "push{q}\t{%gs|GS}", []>, TB;
 
 // No "pop cs" instruction.
 def POPSS16 : I<0x17, RawFrm, (outs), (ins),
-                "pop{w}\t%ss", []>, OpSize, Requires<[In32BitMode]>;
+                "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>;
 def POPSS32 : I<0x17, RawFrm, (outs), (ins),
-                "pop{l}\t%ss", []>        , Requires<[In32BitMode]>;
+                "pop{l}\t{%ss|SS}", []>        , Requires<[In32BitMode]>;
                 
 def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
-                "pop{w}\t%ds", []>, OpSize, Requires<[In32BitMode]>;
+                "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>;
 def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
-                "pop{l}\t%ds", []>        , Requires<[In32BitMode]>;
+                "pop{l}\t{%ds|DS}", []>        , Requires<[In32BitMode]>;
                 
 def POPES16 : I<0x07, RawFrm, (outs), (ins),
-                "pop{w}\t%es", []>, OpSize, Requires<[In32BitMode]>;
+                "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>;
 def POPES32 : I<0x07, RawFrm, (outs), (ins),
-                "pop{l}\t%es", []>        , Requires<[In32BitMode]>;
+                "pop{l}\t{%es|ES}", []>        , Requires<[In32BitMode]>;
                 
 def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{w}\t%fs", []>, OpSize, TB;
+                "pop{w}\t{%fs|FS}", []>, OpSize, TB;
 def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{l}\t%fs", []>, TB    , Requires<[In32BitMode]>;
+                "pop{l}\t{%fs|FS}", []>, TB    , Requires<[In32BitMode]>;
 def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{q}\t%fs", []>, TB;
+                "pop{q}\t{%fs|FS}", []>, TB;
                 
 def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{w}\t%gs", []>, OpSize, TB;
+                "pop{w}\t{%gs|GS}", []>, OpSize, TB;
 def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{l}\t%gs", []>, TB    , Requires<[In32BitMode]>;
+                "pop{l}\t{%gs|GS}", []>, TB    , Requires<[In32BitMode]>;
 def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{q}\t%gs", []>, TB;
+                "pop{q}\t{%gs|GS}", []>, TB;
                  
 
 def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
@@ -400,12 +400,29 @@ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
 def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
 def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
 
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
 let Defs = [RDX, RAX], Uses = [RCX] in
   def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
 
 let Uses = [RDX, RAX, RCX] in
   def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
 
+let Uses = [RDX, RAX] in {
+  def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+               "xsave\t$dst", []>, TB;
+  def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+                 "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+  def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+               "xrstor\t$dst", []>, TB;
+  def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+                 "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+  def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                  "xsaveopt\t$dst", []>, TB;
+  def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                    "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+}
+
 //===----------------------------------------------------------------------===//
 // VIA PadLock crypto instructions
 let Defs = [RAX, RDI], Uses = [RDX, RDI] in
@@ -427,3 +444,24 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
 }
 let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
   def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [In64BitMode] in {
+  def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+                   "rdfsbase{l}\t$dst", []>, TB, XS;
+  def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+                     "rdfsbase{q}\t$dst", []>, TB, XS;
+  def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst", []>, TB, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                     "rdgsbase{q}\t$dst", []>, TB, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$dst),
+                   "wrfsbase{l}\t$dst", []>, TB, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$dst),
+                   "wrfsbase{q}\t$dst", []>, TB, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$dst),
+                   "wrgsbase{l}\t$dst", []>, TB, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$dst),
+                    "wrgsbase{q}\t$dst", []>, TB, XS;
+}
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index daf61e4625d44..09a7a7d0c4d0b 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -16,9 +16,15 @@
 // VMX instructions
 
 // 66 0F 38 80
-def INVEPT : I<0x80, RawFrm, (outs), (ins), "invept", []>, OpSize, T8;
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+               "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+               "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
 // 66 0F 38 81
-def INVVPID : I<0x81, RawFrm, (outs), (ins), "invvpid", []>, OpSize, T8;
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
 // 0F 01 C1
 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
 def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
deleted file mode 100644
index ce8ef495c0010..0000000000000
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ /dev/null
@@ -1,1044 +0,0 @@
-//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the X86MCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86FixupKinds.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-class X86MCCodeEmitter : public MCCodeEmitter {
-  X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
-  const MCInstrInfo &MCII;
-  const MCSubtargetInfo &STI;
-  MCContext &Ctx;
-public:
-  X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                   MCContext &ctx)
-    : MCII(mcii), STI(sti), Ctx(ctx) {
-  }
-
-  ~X86MCCodeEmitter() {}
-
-  bool is64BitMode() const {
-    // FIXME: Can tablegen auto-generate this?
-    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
-  }
-
-  static unsigned GetX86RegNum(const MCOperand &MO) {
-    return X86RegisterInfo::getX86RegNum(MO.getReg());
-  }
-
-  // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
-  // 0-7 and the difference between the 2 groups is given by the REX prefix.
-  // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
-  // in 1's complement form, example:
-  //
-  //  ModRM field => XMM9 => 1
-  //  VEX.VVVV    => XMM9 => ~9
-  //
-  // See table 4-35 of Intel AVX Programming Reference for details.
-  static unsigned char getVEXRegisterEncoding(const MCInst &MI,
-                                              unsigned OpNum) {
-    unsigned SrcReg = MI.getOperand(OpNum).getReg();
-    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
-    if ((SrcReg >= X86::XMM8 && SrcReg <= X86::XMM15) ||
-        (SrcReg >= X86::YMM8 && SrcReg <= X86::YMM15))
-      SrcRegNum += 8;
-
-    // The registers represented through VEX_VVVV should
-    // be encoded in 1's complement form.
-    return (~SrcRegNum) & 0xf;
-  }
-
-  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
-    OS << (char)C;
-    ++CurByte;
-  }
-
-  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-                    raw_ostream &OS) const {
-    // Output the constant in little endian byte order.
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, CurByte, OS);
-      Val >>= 8;
-    }
-  }
-
-  void EmitImmediate(const MCOperand &Disp,
-                     unsigned ImmSize, MCFixupKind FixupKind,
-                     unsigned &CurByte, raw_ostream &OS,
-                     SmallVectorImpl<MCFixup> &Fixups,
-                     int ImmOffset = 0) const;
-
-  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
-                                        unsigned RM) {
-    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
-    return RM | (RegOpcode << 3) | (Mod << 6);
-  }
-
-  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
-                        unsigned &CurByte, raw_ostream &OS) const {
-    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
-  }
-
-  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
-                   unsigned &CurByte, raw_ostream &OS) const {
-    // SIB byte is in the same format as the ModRMByte.
-    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
-  }
-
-
-  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
-                        unsigned RegOpcodeField,
-                        uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
-                        SmallVectorImpl<MCFixup> &Fixups) const;
-
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups) const;
-
-  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
-                           const MCInst &MI, const MCInstrDesc &Desc,
-                           raw_ostream &OS) const;
-
-  void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
-                                 int MemOperand, const MCInst &MI,
-                                 raw_ostream &OS) const;
-
-  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
-                        const MCInst &MI, const MCInstrDesc &Desc,
-                        raw_ostream &OS) const;
-};
-
-} // end anonymous namespace
-
-
-MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
-                                            const MCSubtargetInfo &STI,
-                                            MCContext &Ctx) {
-  return new X86MCCodeEmitter(MCII, STI, Ctx);
-}
-
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
-static bool isDisp8(int Value) {
-  return Value == (signed char)Value;
-}
-
-/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
-/// in an instruction with the specified TSFlags.
-static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
-  unsigned Size = X86II::getSizeOfImm(TSFlags);
-  bool isPCRel = X86II::isImmPCRel(TSFlags);
-
-  return MCFixup::getKindForSize(Size, isPCRel);
-}
-
-/// Is32BitMemOperand - Return true if the specified instruction with a memory
-/// operand should emit the 0x67 prefix byte in 64-bit mode due to a 32-bit
-/// memory operand.  Op specifies the operand # of the memoperand.
-static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
-  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
-  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-  
-  if ((BaseReg.getReg() != 0 && X86::GR32RegClass.contains(BaseReg.getReg())) ||
-      (IndexReg.getReg() != 0 && X86::GR32RegClass.contains(IndexReg.getReg())))
-    return true;
-  return false;
-}
-
-/// StartsWithGlobalOffsetTable - Return true for the simple cases where this
-/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support
-/// PIC on ELF i386 as that symbol is magic. We check only simple case that
-/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
-/// of a binary expression.
-static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) {
-  if (Expr->getKind() == MCExpr::Binary) {
-    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
-    Expr = BE->getLHS();
-  }
-
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    return false;
-
-  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
-  const MCSymbol &S = Ref->getSymbol();
-  return S.getName() == "_GLOBAL_OFFSET_TABLE_";
-}
-
-void X86MCCodeEmitter::
-EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
-              unsigned &CurByte, raw_ostream &OS,
-              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
-  const MCExpr *Expr = NULL;
-  if (DispOp.isImm()) {
-    // If this is a simple integer displacement that doesn't require a relocation,
-    // emit it now.
-    if (FixupKind != FK_PCRel_1 &&
-	FixupKind != FK_PCRel_2 &&
-	FixupKind != FK_PCRel_4) {
-      EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
-      return;
-    }
-    Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
-  } else {
-    Expr = DispOp.getExpr();
-  }
-
-  // If we have an immoffset, add it to the expression.
-  if (FixupKind == FK_Data_4 && StartsWithGlobalOffsetTable(Expr)) {
-    assert(ImmOffset == 0);
-
-    FixupKind = MCFixupKind(X86::reloc_global_offset_table);
-    ImmOffset = CurByte;
-  }
-
-  // If the fixup is pc-relative, we need to bias the value to be relative to
-  // the start of the field, not the end of the field.
-  if (FixupKind == FK_PCRel_4 ||
-      FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
-      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
-    ImmOffset -= 4;
-  if (FixupKind == FK_PCRel_2)
-    ImmOffset -= 2;
-  if (FixupKind == FK_PCRel_1)
-    ImmOffset -= 1;
-
-  if (ImmOffset)
-    Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
-                                   Ctx);
-
-  // Emit a symbolic constant as a fixup and 4 zeros.
-  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
-  EmitConstant(0, Size, CurByte, OS);
-}
-
-void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
-                                        unsigned RegOpcodeField,
-                                        uint64_t TSFlags, unsigned &CurByte,
-                                        raw_ostream &OS,
-                                        SmallVectorImpl<MCFixup> &Fixups) const{
-  const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
-  const MCOperand &Base     = MI.getOperand(Op+X86::AddrBaseReg);
-  const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
-  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-  unsigned BaseReg = Base.getReg();
-
-  // Handle %rip relative addressing.
-  if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
-    assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode");
-    assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
-    EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
-
-    unsigned FixupKind = X86::reloc_riprel_4byte;
-
-    // movq loads are handled with a special relocation form which allows the
-    // linker to eliminate some loads for GOT references which end up in the
-    // same linkage unit.
-    if (MI.getOpcode() == X86::MOV64rm)
-      FixupKind = X86::reloc_riprel_4byte_movq_load;
-
-    // rip-relative addressing is actually relative to the *next* instruction.
-    // Since an immediate can follow the mod/rm byte for an instruction, this
-    // means that we need to bias the immediate field of the instruction with
-    // the size of the immediate field.  If we have this case, add it into the
-    // expression to emit.
-    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
-
-    EmitImmediate(Disp, 4, MCFixupKind(FixupKind),
-                  CurByte, OS, Fixups, -ImmSize);
-    return;
-  }
-
-  unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
-
-  // Determine whether a SIB byte is needed.
-  // If no BaseReg, issue a RIP relative instruction only if the MCE can
-  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
-  // 2-7) and absolute references.
-
-  if (// The SIB byte must be used if there is an index register.
-      IndexReg.getReg() == 0 &&
-      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
-      // encode to an R/M value of 4, which indicates that a SIB byte is
-      // present.
-      BaseRegNo != N86::ESP &&
-      // If there is no base register and we're in 64-bit mode, we need a SIB
-      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
-      (!is64BitMode() || BaseReg != 0)) {
-
-    if (BaseReg == 0) {          // [disp32]     in X86-32 mode
-      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
-      EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
-      return;
-    }
-
-    // If the base is not EBP/ESP and there is no displacement, use simple
-    // indirect register encoding, this handles addresses like [EAX].  The
-    // encoding for [EBP] with no displacement means [disp32] so we handle it
-    // by emitting a displacement of 0 below.
-    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
-      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
-      return;
-    }
-
-    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
-    if (Disp.isImm() && isDisp8(Disp.getImm())) {
-      EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
-      EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
-      return;
-    }
-
-    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
-    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
-    EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
-                  Fixups);
-    return;
-  }
-
-  // We need a SIB byte, so start by outputting the ModR/M byte first
-  assert(IndexReg.getReg() != X86::ESP &&
-         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
-
-  bool ForceDisp32 = false;
-  bool ForceDisp8  = false;
-  if (BaseReg == 0) {
-    // If there is no base register, we emit the special case SIB byte with
-    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
-    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
-    ForceDisp32 = true;
-  } else if (!Disp.isImm()) {
-    // Emit the normal disp32 encoding.
-    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
-    ForceDisp32 = true;
-  } else if (Disp.getImm() == 0 &&
-             // Base reg can't be anything that ends up with '5' as the base
-             // reg, it is the magic [*] nomenclature that indicates no base.
-             BaseRegNo != N86::EBP) {
-    // Emit no displacement ModR/M byte
-    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
-  } else if (isDisp8(Disp.getImm())) {
-    // Emit the disp8 encoding.
-    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
-    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
-  } else {
-    // Emit the normal disp32 encoding.
-    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
-  }
-
-  // Calculate what the SS field value should be...
-  static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
-  unsigned SS = SSTable[Scale.getImm()];
-
-  if (BaseReg == 0) {
-    // Handle the SIB byte for the case where there is no base, see Intel
-    // Manual 2A, table 2-7. The displacement has already been output.
-    unsigned IndexRegNo;
-    if (IndexReg.getReg())
-      IndexRegNo = GetX86RegNum(IndexReg);
-    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
-      IndexRegNo = 4;
-    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
-  } else {
-    unsigned IndexRegNo;
-    if (IndexReg.getReg())
-      IndexRegNo = GetX86RegNum(IndexReg);
-    else
-      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
-    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
-  }
-
-  // Do we need to output a displacement?
-  if (ForceDisp8)
-    EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
-  else if (ForceDisp32 || Disp.getImm() != 0)
-    EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
-                  Fixups);
-}
-
-/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
-/// called VEX.
-void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
-                                           int MemOperand, const MCInst &MI,
-                                           const MCInstrDesc &Desc,
-                                           raw_ostream &OS) const {
-  bool HasVEX_4V = false;
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
-    HasVEX_4V = true;
-
-  // VEX_R: opcode externsion equivalent to REX.R in
-  // 1's complement (inverted) form
-  //
-  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
-  //  0: Same as REX_R=1 (64 bit mode only)
-  //
-  unsigned char VEX_R = 0x1;
-
-  // VEX_X: equivalent to REX.X, only used when a
-  // register is used for index in SIB Byte.
-  //
-  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
-  //  0: Same as REX.X=1 (64-bit mode only)
-  unsigned char VEX_X = 0x1;
-
-  // VEX_B:
-  //
-  //  1: Same as REX_B=0 (ignored in 32-bit mode)
-  //  0: Same as REX_B=1 (64 bit mode only)
-  //
-  unsigned char VEX_B = 0x1;
-
-  // VEX_W: opcode specific (use like REX.W, or used for
-  // opcode extension, or ignored, depending on the opcode byte)
-  unsigned char VEX_W = 0;
-
-  // VEX_5M (VEX m-mmmmm field):
-  //
-  //  0b00000: Reserved for future use
-  //  0b00001: implied 0F leading opcode
-  //  0b00010: implied 0F 38 leading opcode bytes
-  //  0b00011: implied 0F 3A leading opcode bytes
-  //  0b00100-0b11111: Reserved for future use
-  //
-  unsigned char VEX_5M = 0x1;
-
-  // VEX_4V (VEX vvvv field): a register specifier
-  // (in 1's complement form) or 1111 if unused.
-  unsigned char VEX_4V = 0xf;
-
-  // VEX_L (Vector Length):
-  //
-  //  0: scalar or 128-bit vector
-  //  1: 256-bit vector
-  //
-  unsigned char VEX_L = 0;
-
-  // VEX_PP: opcode extension providing equivalent
-  // functionality of a SIMD prefix
-  //
-  //  0b00: None
-  //  0b01: 66
-  //  0b10: F3
-  //  0b11: F2
-  //
-  unsigned char VEX_PP = 0;
-
-  // Encode the operand size opcode prefix as needed.
-  if (TSFlags & X86II::OpSize)
-    VEX_PP = 0x01;
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
-    VEX_W = 1;
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
-    VEX_L = 1;
-
-  switch (TSFlags & X86II::Op0Mask) {
-  default: assert(0 && "Invalid prefix!");
-  case X86II::T8:  // 0F 38
-    VEX_5M = 0x2;
-    break;
-  case X86II::TA:  // 0F 3A
-    VEX_5M = 0x3;
-    break;
-  case X86II::TF:  // F2 0F 38
-    VEX_PP = 0x3;
-    VEX_5M = 0x2;
-    break;
-  case X86II::XS:  // F3 0F
-    VEX_PP = 0x2;
-    break;
-  case X86II::XD:  // F2 0F
-    VEX_PP = 0x3;
-    break;
-  case X86II::A6:  // Bypass: Not used by VEX
-  case X86II::A7:  // Bypass: Not used by VEX
-  case X86II::TB:  // Bypass: Not used by VEX
-  case 0:
-    break;  // No prefix!
-  }
-
-  // Set the vector length to 256-bit if YMM0-YMM15 is used
-  for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
-    if (!MI.getOperand(i).isReg())
-      continue;
-    unsigned SrcReg = MI.getOperand(i).getReg();
-    if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
-      VEX_L = 1;
-  }
-
-  unsigned NumOps = MI.getNumOperands();
-  unsigned CurOp = 0;
-  bool IsDestMem = false;
-
-  switch (TSFlags & X86II::FormMask) {
-  case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
-  case X86II::MRMDestMem:
-    IsDestMem = true;
-    // The important info for the VEX prefix is never beyond the address
-    // registers. Don't check beyond that.
-    NumOps = CurOp = X86::AddrNumOperands;
-  case X86II::MRM0m: case X86II::MRM1m:
-  case X86II::MRM2m: case X86II::MRM3m:
-  case X86II::MRM4m: case X86II::MRM5m:
-  case X86II::MRM6m: case X86II::MRM7m:
-  case X86II::MRMSrcMem:
-  case X86II::MRMSrcReg:
-    if (MI.getNumOperands() > CurOp && MI.getOperand(CurOp).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-      VEX_R = 0x0;
-    CurOp++;
-
-    if (HasVEX_4V) {
-      VEX_4V = getVEXRegisterEncoding(MI, IsDestMem ? CurOp-1 : CurOp);
-      CurOp++;
-    }
-
-    // To only check operands before the memory address ones, start
-    // the search from the beginning
-    if (IsDestMem)
-      CurOp = 0;
-
-    // If the last register should be encoded in the immediate field
-    // do not use any bit from VEX prefix to this register, ignore it
-    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM)
-      NumOps--;
-
-    for (; CurOp != NumOps; ++CurOp) {
-      const MCOperand &MO = MI.getOperand(CurOp);
-      if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-        VEX_B = 0x0;
-      if (!VEX_B && MO.isReg() &&
-          ((TSFlags & X86II::FormMask) == X86II::MRMSrcMem) &&
-          X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-        VEX_X = 0x0;
-    }
-    break;
-  default: // MRMDestReg, MRM0r-MRM7r, RawFrm
-    if (!MI.getNumOperands())
-      break;
-
-    if (MI.getOperand(CurOp).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-      VEX_B = 0;
-
-    if (HasVEX_4V)
-      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-
-    CurOp++;
-    for (; CurOp != NumOps; ++CurOp) {
-      const MCOperand &MO = MI.getOperand(CurOp);
-      if (MO.isReg() && !HasVEX_4V &&
-          X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-        VEX_R = 0x0;
-    }
-    break;
-  }
-
-  // Emit segment override opcode prefix as needed.
-  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
-
-  // VEX opcode prefix can have 2 or 3 bytes
-  //
-  //  3 bytes:
-  //    +-----+ +--------------+ +-------------------+
-  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
-  //    +-----+ +--------------+ +-------------------+
-  //  2 bytes:
-  //    +-----+ +-------------------+
-  //    | C5h | | R | vvvv | L | pp |
-  //    +-----+ +-------------------+
-  //
-  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
-
-  if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix
-    EmitByte(0xC5, CurByte, OS);
-    EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
-    return;
-  }
-
-  // 3 byte VEX prefix
-  EmitByte(0xC4, CurByte, OS);
-  EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
-  EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
-}
-
-/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
-/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
-/// size, and 3) use of X86-64 extended registers.
-static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
-                                   const MCInstrDesc &Desc) {
-  unsigned REX = 0;
-  if (TSFlags & X86II::REX_W)
-    REX |= 1 << 3; // set REX.W
-
-  if (MI.getNumOperands() == 0) return REX;
-
-  unsigned NumOps = MI.getNumOperands();
-  // FIXME: MCInst should explicitize the two-addrness.
-  bool isTwoAddr = NumOps > 1 &&
-                      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
-  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
-  unsigned i = isTwoAddr ? 1 : 0;
-  for (; i != NumOps; ++i) {
-    const MCOperand &MO = MI.getOperand(i);
-    if (!MO.isReg()) continue;
-    unsigned Reg = MO.getReg();
-    if (!X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) continue;
-    // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
-    // that returns non-zero.
-    REX |= 0x40; // REX fixed encoding prefix
-    break;
-  }
-
-  switch (TSFlags & X86II::FormMask) {
-  case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
-  case X86II::MRMSrcReg:
-    if (MI.getOperand(0).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
-      REX |= 1 << 2; // set REX.R
-    i = isTwoAddr ? 2 : 1;
-    for (; i != NumOps; ++i) {
-      const MCOperand &MO = MI.getOperand(i);
-      if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-        REX |= 1 << 0; // set REX.B
-    }
-    break;
-  case X86II::MRMSrcMem: {
-    if (MI.getOperand(0).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
-      REX |= 1 << 2; // set REX.R
-    unsigned Bit = 0;
-    i = isTwoAddr ? 2 : 1;
-    for (; i != NumOps; ++i) {
-      const MCOperand &MO = MI.getOperand(i);
-      if (MO.isReg()) {
-        if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-          REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
-        Bit++;
-      }
-    }
-    break;
-  }
-  case X86II::MRM0m: case X86II::MRM1m:
-  case X86II::MRM2m: case X86II::MRM3m:
-  case X86II::MRM4m: case X86II::MRM5m:
-  case X86II::MRM6m: case X86II::MRM7m:
-  case X86II::MRMDestMem: {
-    unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
-    i = isTwoAddr ? 1 : 0;
-    if (NumOps > e && MI.getOperand(e).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
-      REX |= 1 << 2; // set REX.R
-    unsigned Bit = 0;
-    for (; i != e; ++i) {
-      const MCOperand &MO = MI.getOperand(i);
-      if (MO.isReg()) {
-        if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-          REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
-        Bit++;
-      }
-    }
-    break;
-  }
-  default:
-    if (MI.getOperand(0).isReg() &&
-        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
-      REX |= 1 << 0; // set REX.B
-    i = isTwoAddr ? 2 : 1;
-    for (unsigned e = NumOps; i != e; ++i) {
-      const MCOperand &MO = MI.getOperand(i);
-      if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
-        REX |= 1 << 2; // set REX.R
-    }
-    break;
-  }
-  return REX;
-}
-
-/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
-void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
-                                        unsigned &CurByte, int MemOperand,
-                                        const MCInst &MI,
-                                        raw_ostream &OS) const {
-  switch (TSFlags & X86II::SegOvrMask) {
-  default: assert(0 && "Invalid segment!");
-  case 0:
-    // No segment override, check for explicit one on memory operand.
-    if (MemOperand != -1) {   // If the instruction has a memory operand.
-      switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
-      default: assert(0 && "Unknown segment register!");
-      case 0: break;
-      case X86::CS: EmitByte(0x2E, CurByte, OS); break;
-      case X86::SS: EmitByte(0x36, CurByte, OS); break;
-      case X86::DS: EmitByte(0x3E, CurByte, OS); break;
-      case X86::ES: EmitByte(0x26, CurByte, OS); break;
-      case X86::FS: EmitByte(0x64, CurByte, OS); break;
-      case X86::GS: EmitByte(0x65, CurByte, OS); break;
-      }
-    }
-    break;
-  case X86II::FS:
-    EmitByte(0x64, CurByte, OS);
-    break;
-  case X86II::GS:
-    EmitByte(0x65, CurByte, OS);
-    break;
-  }
-}
-
-/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
-///
-/// MemOperand is the operand # of the start of a memory operand if present.  If
-/// Not present, it is -1.
-void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
-                                        int MemOperand, const MCInst &MI,
-                                        const MCInstrDesc &Desc,
-                                        raw_ostream &OS) const {
-
-  // Emit the lock opcode prefix as needed.
-  if (TSFlags & X86II::LOCK)
-    EmitByte(0xF0, CurByte, OS);
-
-  // Emit segment override opcode prefix as needed.
-  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
-
-  // Emit the repeat opcode prefix as needed.
-  if ((TSFlags & X86II::Op0Mask) == X86II::REP)
-    EmitByte(0xF3, CurByte, OS);
-
-  // Emit the address size opcode prefix as needed.
-  if ((TSFlags & X86II::AdSize) ||
-      (MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand)))
-    EmitByte(0x67, CurByte, OS);
-  
-  // Emit the operand size opcode prefix as needed.
-  if (TSFlags & X86II::OpSize)
-    EmitByte(0x66, CurByte, OS);
-
-  bool Need0FPrefix = false;
-  switch (TSFlags & X86II::Op0Mask) {
-  default: assert(0 && "Invalid prefix!");
-  case 0: break;  // No prefix!
-  case X86II::REP: break; // already handled.
-  case X86II::TB:  // Two-byte opcode prefix
-  case X86II::T8:  // 0F 38
-  case X86II::TA:  // 0F 3A
-  case X86II::A6:  // 0F A6
-  case X86II::A7:  // 0F A7
-    Need0FPrefix = true;
-    break;
-  case X86II::TF: // F2 0F 38
-    EmitByte(0xF2, CurByte, OS);
-    Need0FPrefix = true;
-    break;
-  case X86II::XS:   // F3 0F
-    EmitByte(0xF3, CurByte, OS);
-    Need0FPrefix = true;
-    break;
-  case X86II::XD:   // F2 0F
-    EmitByte(0xF2, CurByte, OS);
-    Need0FPrefix = true;
-    break;
-  case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
-  case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
-  case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
-  case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
-  case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
-  case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
-  case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
-  case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
-  }
-
-  // Handle REX prefix.
-  // FIXME: Can this come before F2 etc to simplify emission?
-  if (is64BitMode()) {
-    if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
-      EmitByte(0x40 | REX, CurByte, OS);
-  }
-
-  // 0x0F escape code must be emitted just before the opcode.
-  if (Need0FPrefix)
-    EmitByte(0x0F, CurByte, OS);
-
-  // FIXME: Pull this up into previous switch if REX can be moved earlier.
-  switch (TSFlags & X86II::Op0Mask) {
-  case X86II::TF:    // F2 0F 38
-  case X86II::T8:    // 0F 38
-    EmitByte(0x38, CurByte, OS);
-    break;
-  case X86II::TA:    // 0F 3A
-    EmitByte(0x3A, CurByte, OS);
-    break;
-  case X86II::A6:    // 0F A6
-    EmitByte(0xA6, CurByte, OS);
-    break;
-  case X86II::A7:    // 0F A7
-    EmitByte(0xA7, CurByte, OS);
-    break;
-  }
-}
-
-void X86MCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Opcode = MI.getOpcode();
-  const MCInstrDesc &Desc = MCII.get(Opcode);
-  uint64_t TSFlags = Desc.TSFlags;
-
-  // Pseudo instructions don't get encoded.
-  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
-    return;
-
-  // If this is a two-address instruction, skip one of the register operands.
-  // FIXME: This should be handled during MCInst lowering.
-  unsigned NumOps = Desc.getNumOperands();
-  unsigned CurOp = 0;
-  if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1)
-    ++CurOp;
-  else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0)
-    // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
-    --NumOps;
-
-  // Keep track of the current byte being emitted.
-  unsigned CurByte = 0;
-
-  // Is this instruction encoded using the AVX VEX prefix?
-  bool HasVEXPrefix = false;
-
-  // It uses the VEX.VVVV field?
-  bool HasVEX_4V = false;
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX)
-    HasVEXPrefix = true;
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
-    HasVEX_4V = true;
-
-  
-  // Determine where the memory operand starts, if present.
-  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
-  if (MemoryOperand != -1) MemoryOperand += CurOp;
-
-  if (!HasVEXPrefix)
-    EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
-  else
-    EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
-
-  
-  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
-  
-  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
-    BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
-  
-  unsigned SrcRegNum = 0;
-  switch (TSFlags & X86II::FormMask) {
-  case X86II::MRMInitReg:
-    assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
-  default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
-    assert(0 && "Unknown FormMask value in X86MCCodeEmitter!");
-  case X86II::Pseudo:
-    assert(0 && "Pseudo instruction shouldn't be emitted");
-  case X86II::RawFrm:
-    EmitByte(BaseOpcode, CurByte, OS);
-    break;
-      
-  case X86II::RawFrmImm8:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitImmediate(MI.getOperand(CurOp++),
-                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
-                  CurByte, OS, Fixups);
-    EmitImmediate(MI.getOperand(CurOp++), 1, FK_Data_1, CurByte, OS, Fixups);
-    break;
-  case X86II::RawFrmImm16:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitImmediate(MI.getOperand(CurOp++),
-                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
-                  CurByte, OS, Fixups);
-    EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups);
-    break;
-
-  case X86II::AddRegFrm:
-    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
-    break;
-
-  case X86II::MRMDestReg:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitRegModRMByte(MI.getOperand(CurOp),
-                     GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS);
-    CurOp += 2;
-    break;
-
-  case X86II::MRMDestMem:
-    EmitByte(BaseOpcode, CurByte, OS);
-    SrcRegNum = CurOp + X86::AddrNumOperands;
-
-    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
-      SrcRegNum++;
-
-    EmitMemModRMByte(MI, CurOp,
-                     GetX86RegNum(MI.getOperand(SrcRegNum)),
-                     TSFlags, CurByte, OS, Fixups);
-    CurOp = SrcRegNum + 1;
-    break;
-
-  case X86II::MRMSrcReg:
-    EmitByte(BaseOpcode, CurByte, OS);
-    SrcRegNum = CurOp + 1;
-
-    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
-      SrcRegNum++;
-
-    EmitRegModRMByte(MI.getOperand(SrcRegNum),
-                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
-    CurOp = SrcRegNum + 1;
-    break;
-
-  case X86II::MRMSrcMem: {
-    int AddrOperands = X86::AddrNumOperands;
-    unsigned FirstMemOp = CurOp+1;
-    if (HasVEX_4V) {
-      ++AddrOperands;
-      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
-    }
-
-    EmitByte(BaseOpcode, CurByte, OS);
-
-    EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
-                     TSFlags, CurByte, OS, Fixups);
-    CurOp += AddrOperands + 1;
-    break;
-  }
-
-  case X86II::MRM0r: case X86II::MRM1r:
-  case X86II::MRM2r: case X86II::MRM3r:
-  case X86II::MRM4r: case X86II::MRM5r:
-  case X86II::MRM6r: case X86II::MRM7r:
-    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
-      CurOp++;
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitRegModRMByte(MI.getOperand(CurOp++),
-                     (TSFlags & X86II::FormMask)-X86II::MRM0r,
-                     CurByte, OS);
-    break;
-  case X86II::MRM0m: case X86II::MRM1m:
-  case X86II::MRM2m: case X86II::MRM3m:
-  case X86II::MRM4m: case X86II::MRM5m:
-  case X86II::MRM6m: case X86II::MRM7m:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
-                     TSFlags, CurByte, OS, Fixups);
-    CurOp += X86::AddrNumOperands;
-    break;
-  case X86II::MRM_C1:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC1, CurByte, OS);
-    break;
-  case X86II::MRM_C2:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC2, CurByte, OS);
-    break;
-  case X86II::MRM_C3:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC3, CurByte, OS);
-    break;
-  case X86II::MRM_C4:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC4, CurByte, OS);
-    break;
-  case X86II::MRM_C8:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC8, CurByte, OS);
-    break;
-  case X86II::MRM_C9:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xC9, CurByte, OS);
-    break;
-  case X86II::MRM_E8:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xE8, CurByte, OS);
-    break;
-  case X86II::MRM_F0:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xF0, CurByte, OS);
-    break;
-  case X86II::MRM_F8:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xF8, CurByte, OS);
-    break;
-  case X86II::MRM_F9:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xF9, CurByte, OS);
-    break;
-  case X86II::MRM_D0:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xD0, CurByte, OS);
-    break;
-  case X86II::MRM_D1:
-    EmitByte(BaseOpcode, CurByte, OS);
-    EmitByte(0xD1, CurByte, OS);
-    break;
-  }
-
-  // If there is a remaining operand, it must be a trailing immediate.  Emit it
-  // according to the right size for the instruction.
-  if (CurOp != NumOps) {
-    // The last source register of a 4 operand instruction in AVX is encoded
-    // in bits[7:4] of a immediate byte, and bits[3:0] are ignored.
-    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
-      const MCOperand &MO = MI.getOperand(CurOp++);
-      bool IsExtReg =
-        X86InstrInfo::isX86_64ExtendedReg(MO.getReg());
-      unsigned RegNum = (IsExtReg ? (1 << 7) : 0);
-      RegNum |= GetX86RegNum(MO) << 4;
-      EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS,
-                    Fixups);
-    } else {
-      unsigned FixupKind;
-      // FIXME: Is there a better way to know that we need a signed relocation?
-      if (MI.getOpcode() == X86::ADD64ri32 ||
-          MI.getOpcode() == X86::MOV64ri32 ||
-          MI.getOpcode() == X86::MOV64mi32 ||
-          MI.getOpcode() == X86::PUSH64i32)
-        FixupKind = X86::reloc_signed_4byte;
-      else
-        FixupKind = getImmFixupKind(TSFlags);
-      EmitImmediate(MI.getOperand(CurOp++),
-                    X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
-                    CurByte, OS, Fixups);
-    }
-  }
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
-    EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
-  
-
-#ifndef NDEBUG
-  // FIXME: Verify.
-  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
-    errs() << "Cannot encode all operands of: ";
-    MI.dump();
-    errs() << '\n';
-    abort();
-  }
-#endif
-}
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index e385335555346..50bc14d357f83 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -372,15 +372,10 @@ ReSimplify:
   case X86::FsFLD0SD:      LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::VFsFLD0SS:     LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
   case X86::VFsFLD0SD:     LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
-  case X86::V_SET0PS:      LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
-  case X86::V_SET0PD:      LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
-  case X86::V_SET0PI:      LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::V_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
-  case X86::AVX_SET0PS:    LowerUnaryToTwoAddr(OutMI, X86::VXORPSrr); break;
   case X86::AVX_SET0PSY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
-  case X86::AVX_SET0PD:    LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
   case X86::AVX_SET0PDY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
-  case X86::AVX_SET0PI:    LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
+  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
 
   case X86::MOV16r0:
     LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
@@ -468,6 +463,18 @@ ReSimplify:
   case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
   case X86::JG_4:  OutMI.setOpcode(X86::JG_1); break;
 
+  // Atomic load and store require a separate pseudo-inst because Acquire
+  // implies mayStore and Release implies mayLoad; fix these to regular MOV
+  // instructions here
+  case X86::ACQUIRE_MOV8rm:  OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+  case X86::RELEASE_MOV8mr:  OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+  case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+  case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+  case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+
   // We don't currently select the correct instruction form for instructions
   // which have a short %eax, etc. form. Handle this by custom lowering, for
   // now.
@@ -585,6 +592,8 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
 }
 
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  OutStreamer.EmitCodeRegion();
+
   X86MCInstLower MCInstLowering(Mang, *MF, *this);
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
@@ -601,7 +610,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     if (OutStreamer.hasRawTextSupport())
       OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER"));
     return;
-        
+
 
   case X86::EH_RETURN:
   case X86::EH_RETURN64: {
diff --git a/lib/Target/X86/X86MachObjectWriter.cpp b/lib/Target/X86/X86MachObjectWriter.cpp
deleted file mode 100644
index 37110382379ee..0000000000000
--- a/lib/Target/X86/X86MachObjectWriter.cpp
+++ /dev/null
@@ -1,554 +0,0 @@
-//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86FixupKinds.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Object/MachOFormat.h"
-
-using namespace llvm;
-using namespace llvm::object;
-
-namespace {
-class X86MachObjectWriter : public MCMachObjectTargetWriter {
-  void RecordScatteredRelocation(MachObjectWriter *Writer,
-                                 const MCAssembler &Asm,
-                                 const MCAsmLayout &Layout,
-                                 const MCFragment *Fragment,
-                                 const MCFixup &Fixup,
-                                 MCValue Target,
-                                 unsigned Log2Size,
-                                 uint64_t &FixedValue);
-  void RecordTLVPRelocation(MachObjectWriter *Writer,
-                            const MCAssembler &Asm,
-                            const MCAsmLayout &Layout,
-                            const MCFragment *Fragment,
-                            const MCFixup &Fixup,
-                            MCValue Target,
-                            uint64_t &FixedValue);
-
-  void RecordX86Relocation(MachObjectWriter *Writer,
-                              const MCAssembler &Asm,
-                              const MCAsmLayout &Layout,
-                              const MCFragment *Fragment,
-                              const MCFixup &Fixup,
-                              MCValue Target,
-                              uint64_t &FixedValue);
-  void RecordX86_64Relocation(MachObjectWriter *Writer,
-                              const MCAssembler &Asm,
-                              const MCAsmLayout &Layout,
-                              const MCFragment *Fragment,
-                              const MCFixup &Fixup,
-                              MCValue Target,
-                              uint64_t &FixedValue);
-public:
-  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
-                               /*UseAggressiveSymbolFolding=*/Is64Bit) {}
-
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) {
-    if (Writer->is64Bit())
-      RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
-                             FixedValue);
-    else
-      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
-                          FixedValue);
-  }
-};
-}
-
-static bool isFixupKindRIPRel(unsigned Kind) {
-  return Kind == X86::reloc_riprel_4byte ||
-    Kind == X86::reloc_riprel_4byte_movq_load;
-}
-
-static unsigned getFixupKindLog2Size(unsigned Kind) {
-  switch (Kind) {
-  default:
-    llvm_unreachable("invalid fixup kind!");
-  case FK_PCRel_1:
-  case FK_Data_1: return 0;
-  case FK_PCRel_2:
-  case FK_Data_2: return 1;
-  case FK_PCRel_4:
-    // FIXME: Remove these!!!
-  case X86::reloc_riprel_4byte:
-  case X86::reloc_riprel_4byte_movq_load:
-  case X86::reloc_signed_4byte:
-  case FK_Data_4: return 2;
-  case FK_Data_8: return 3;
-  }
-}
-
-void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
-                                                 const MCAssembler &Asm,
-                                                 const MCAsmLayout &Layout,
-                                                 const MCFragment *Fragment,
-                                                 const MCFixup &Fixup,
-                                                 MCValue Target,
-                                                 uint64_t &FixedValue) {
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
-  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
-
-  // See <reloc.h>.
-  uint32_t FixupOffset =
-    Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
-  uint32_t FixupAddress =
-    Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
-  int64_t Value = 0;
-  unsigned Index = 0;
-  unsigned IsExtern = 0;
-  unsigned Type = 0;
-
-  Value = Target.getConstant();
-
-  if (IsPCRel) {
-    // Compensate for the relocation offset, Darwin x86_64 relocations only have
-    // the addend and appear to have attempted to define it to be the actual
-    // expression addend without the PCrel bias. However, instructions with data
-    // following the relocation are not accommodated for (see comment below
-    // regarding SIGNED{1,2,4}), so it isn't exactly that either.
-    Value += 1LL << Log2Size;
-  }
-
-  if (Target.isAbsolute()) { // constant
-    // SymbolNum of 0 indicates the absolute section.
-    Type = macho::RIT_X86_64_Unsigned;
-    Index = 0;
-
-    // FIXME: I believe this is broken, I don't think the linker can understand
-    // it. I think it would require a local relocation, but I'm not sure if that
-    // would work either. The official way to get an absolute PCrel relocation
-    // is to use an absolute symbol (which we don't support yet).
-    if (IsPCRel) {
-      IsExtern = 1;
-      Type = macho::RIT_X86_64_Branch;
-    }
-  } else if (Target.getSymB()) { // A - B + constant
-    const MCSymbol *A = &Target.getSymA()->getSymbol();
-    MCSymbolData &A_SD = Asm.getSymbolData(*A);
-    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
-
-    const MCSymbol *B = &Target.getSymB()->getSymbol();
-    MCSymbolData &B_SD = Asm.getSymbolData(*B);
-    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
-
-    // Neither symbol can be modified.
-    if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
-        Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
-      report_fatal_error("unsupported relocation of modified symbol");
-
-    // We don't support PCrel relocations of differences. Darwin 'as' doesn't
-    // implement most of these correctly.
-    if (IsPCRel)
-      report_fatal_error("unsupported pc-relative relocation of difference");
-
-    // The support for the situation where one or both of the symbols would
-    // require a local relocation is handled just like if the symbols were
-    // external.  This is certainly used in the case of debug sections where the
-    // section has only temporary symbols and thus the symbols don't have base
-    // symbols.  This is encoded using the section ordinal and non-extern
-    // relocation entries.
-
-    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
-    // single SIGNED relocation); reject it for now.  Except the case where both
-    // symbols don't have a base, equal but both NULL.
-    if (A_Base == B_Base && A_Base)
-      report_fatal_error("unsupported relocation with identical base");
-
-    Value += Writer->getSymbolAddress(&A_SD, Layout) -
-      (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
-    Value -= Writer->getSymbolAddress(&B_SD, Layout) -
-      (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
-
-    if (A_Base) {
-      Index = A_Base->getIndex();
-      IsExtern = 1;
-    }
-    else {
-      Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
-    }
-    Type = macho::RIT_X86_64_Unsigned;
-
-    macho::RelocationEntry MRE;
-    MRE.Word0 = FixupOffset;
-    MRE.Word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
-    Writer->addRelocation(Fragment->getParent(), MRE);
-
-    if (B_Base) {
-      Index = B_Base->getIndex();
-      IsExtern = 1;
-    }
-    else {
-      Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
-    }
-    Type = macho::RIT_X86_64_Subtractor;
-  } else {
-    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
-    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
-    const MCSymbolData *Base = Asm.getAtom(&SD);
-
-    // Relocations inside debug sections always use local relocations when
-    // possible. This seems to be done because the debugger doesn't fully
-    // understand x86_64 relocation entries, and expects to find values that
-    // have already been fixed up.
-    if (Symbol->isInSection()) {
-      const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
-        Fragment->getParent()->getSection());
-      if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG))
-        Base = 0;
-    }
-
-    // x86_64 almost always uses external relocations, except when there is no
-    // symbol to use as a base address (a local symbol with no preceding
-    // non-local symbol).
-    if (Base) {
-      Index = Base->getIndex();
-      IsExtern = 1;
-
-      // Add the local offset, if needed.
-      if (Base != &SD)
-        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
-    } else if (Symbol->isInSection() && !Symbol->isVariable()) {
-      // The index is the section ordinal (1-based).
-      Index = SD.getFragment()->getParent()->getOrdinal() + 1;
-      IsExtern = 0;
-      Value += Writer->getSymbolAddress(&SD, Layout);
-
-      if (IsPCRel)
-        Value -= FixupAddress + (1 << Log2Size);
-    } else if (Symbol->isVariable()) {
-      const MCExpr *Value = Symbol->getVariableValue();
-      int64_t Res;
-      bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
-                                             Writer->getSectionAddressMap());
-      if (isAbs) {
-        FixedValue = Res;
-        return;
-      } else {
-        report_fatal_error("unsupported relocation of variable '" +
-                           Symbol->getName() + "'");
-      }
-    } else {
-      report_fatal_error("unsupported relocation of undefined symbol '" +
-                         Symbol->getName() + "'");
-    }
-
-    MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
-    if (IsPCRel) {
-      if (IsRIPRel) {
-        if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
-          // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
-          // rewrite the movq to an leaq at link time if the symbol ends up in
-          // the same linkage unit.
-          if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
-            Type = macho::RIT_X86_64_GOTLoad;
-          else
-            Type = macho::RIT_X86_64_GOT;
-        }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-          Type = macho::RIT_X86_64_TLV;
-        }  else if (Modifier != MCSymbolRefExpr::VK_None) {
-          report_fatal_error("unsupported symbol modifier in relocation");
-        } else {
-          Type = macho::RIT_X86_64_Signed;
-
-          // The Darwin x86_64 relocation format has a problem where it cannot
-          // encode an address (L<foo> + <constant>) which is outside the atom
-          // containing L<foo>. Generally, this shouldn't occur but it does
-          // happen when we have a RIPrel instruction with data following the
-          // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
-          // adjustment Darwin x86_64 uses, the offset is still negative and the
-          // linker has no way to recognize this.
-          //
-          // To work around this, Darwin uses several special relocation types
-          // to indicate the offsets. However, the specification or
-          // implementation of these seems to also be incomplete; they should
-          // adjust the addend as well based on the actual encoded instruction
-          // (the additional bias), but instead appear to just look at the final
-          // offset.
-          switch (-(Target.getConstant() + (1LL << Log2Size))) {
-          case 1: Type = macho::RIT_X86_64_Signed1; break;
-          case 2: Type = macho::RIT_X86_64_Signed2; break;
-          case 4: Type = macho::RIT_X86_64_Signed4; break;
-          }
-        }
-      } else {
-        if (Modifier != MCSymbolRefExpr::VK_None)
-          report_fatal_error("unsupported symbol modifier in branch "
-                             "relocation");
-
-        Type = macho::RIT_X86_64_Branch;
-      }
-    } else {
-      if (Modifier == MCSymbolRefExpr::VK_GOT) {
-        Type = macho::RIT_X86_64_GOT;
-      } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
-        // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
-        // case all we do is set the PCrel bit in the relocation entry; this is
-        // used with exception handling, for example. The source is required to
-        // include any necessary offset directly.
-        Type = macho::RIT_X86_64_GOT;
-        IsPCRel = 1;
-      } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-        report_fatal_error("TLVP symbol modifier should have been rip-rel");
-      } else if (Modifier != MCSymbolRefExpr::VK_None)
-        report_fatal_error("unsupported symbol modifier in relocation");
-      else
-        Type = macho::RIT_X86_64_Unsigned;
-    }
-  }
-
-  // x86_64 always writes custom values into the fixups.
-  FixedValue = Value;
-
-  // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
-                                                    const MCAssembler &Asm,
-                                                    const MCAsmLayout &Layout,
-                                                    const MCFragment *Fragment,
-                                                    const MCFixup &Fixup,
-                                                    MCValue Target,
-                                                    unsigned Log2Size,
-                                                    uint64_t &FixedValue) {
-  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
-
-  // See <reloc.h>.
-  const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
-
-  if (!A_SD->getFragment())
-    report_fatal_error("symbol '" + A->getName() +
-                       "' can not be undefined in a subtraction expression");
-
-  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
-  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
-  FixedValue += SecAddr;
-  uint32_t Value2 = 0;
-
-  if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
-
-    if (!B_SD->getFragment())
-      report_fatal_error("symbol '" + B->getSymbol().getName() +
-                         "' can not be undefined in a subtraction expression");
-
-    // Select the appropriate difference relocation type.
-    //
-    // Note that there is no longer any semantic difference between these two
-    // relocation types from the linkers point of view, this is done solely for
-    // pedantic compatibility with 'as'.
-    Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
-      (unsigned)macho::RIT_Generic_LocalDifference;
-    Value2 = Writer->getSymbolAddress(B_SD, Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
-  }
-
-  // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
-    Writer->addRelocation(Fragment->getParent(), MRE);
-  }
-
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
-                                               const MCAssembler &Asm,
-                                               const MCAsmLayout &Layout,
-                                               const MCFragment *Fragment,
-                                               const MCFixup &Fixup,
-                                               MCValue Target,
-                                               uint64_t &FixedValue) {
-  assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
-         !is64Bit() &&
-         "Should only be called with a 32-bit TLVP relocation!");
-
-  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
-  uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned IsPCRel = 0;
-
-  // Get the symbol data.
-  MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
-  unsigned Index = SD_A->getIndex();
-
-  // We're only going to have a second symbol in pic mode and it'll be a
-  // subtraction from the picbase. For 32-bit pic the addend is the difference
-  // between the picbase and the next address.  For 32-bit static the addend is
-  // zero.
-  if (Target.getSymB()) {
-    // If this is a subtraction then we're pcrel.
-    uint32_t FixupAddress =
-      Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
-    MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
-    IsPCRel = 1;
-    FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
-                  Target.getConstant());
-    FixedValue += 1ULL << Log2Size;
-  } else {
-    FixedValue = 0;
-  }
-
-  // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = Value;
-  MRE.Word1 = ((Index                  <<  0) |
-               (IsPCRel                << 24) |
-               (Log2Size               << 25) |
-               (1                      << 27) | // Extern
-               (macho::RIT_Generic_TLV << 28)); // Type
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
-                                              const MCAssembler &Asm,
-                                              const MCAsmLayout &Layout,
-                                              const MCFragment *Fragment,
-                                              const MCFixup &Fixup,
-                                              MCValue Target,
-                                              uint64_t &FixedValue) {
-  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
-
-  // If this is a 32-bit TLVP reloc it's handled a bit differently.
-  if (Target.getSymA() &&
-      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
-    RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
-                         FixedValue);
-    return;
-  }
-
-  // If this is a difference or a defined symbol plus an offset, then we need a
-  // scattered relocation entry. Differences always require scattered
-  // relocations.
-  if (Target.getSymB())
-    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                     Target, Log2Size, FixedValue);
-
-  // Get the symbol data, if any.
-  MCSymbolData *SD = 0;
-  if (Target.getSymA())
-    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
-
-  // If this is an internal relocation with an offset, it also needs a scattered
-  // relocation entry.
-  uint32_t Offset = Target.getConstant();
-  if (IsPCRel)
-    Offset += 1 << Log2Size;
-  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
-    return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
-                                     Target, Log2Size, FixedValue);
-
-  // See <reloc.h>.
-  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
-  unsigned Index = 0;
-  unsigned IsExtern = 0;
-  unsigned Type = 0;
-
-  if (Target.isAbsolute()) { // constant
-    // SymbolNum of 0 indicates the absolute section.
-    //
-    // FIXME: Currently, these are never generated (see code below). I cannot
-    // find a case where they are actually emitted.
-    Type = macho::RIT_Vanilla;
-  } else {
-    // Resolve constant variables.
-    if (SD->getSymbol().isVariable()) {
-      int64_t Res;
-      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
-            Res, Layout, Writer->getSectionAddressMap())) {
-        FixedValue = Res;
-        return;
-      }
-    }
-
-    // Check whether we need an external or internal relocation.
-    if (Writer->doesSymbolRequireExternRelocation(SD)) {
-      IsExtern = 1;
-      Index = SD->getIndex();
-      // For external relocations, make sure to offset the fixup value to
-      // compensate for the addend of the symbol address, if it was
-      // undefined. This occurs with weak definitions, for example.
-      if (!SD->Symbol->isUndefined())
-        FixedValue -= Layout.getSymbolOffset(SD);
-    } else {
-      // The index is the section ordinal (1-based).
-      const MCSectionData &SymSD = Asm.getSectionData(
-        SD->getSymbol().getSection());
-      Index = SymSD.getOrdinal() + 1;
-      FixedValue += Writer->getSectionAddress(&SymSD);
-    }
-    if (IsPCRel)
-      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
-
-    Type = macho::RIT_Vanilla;
-  }
-
-  // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
-  Writer->addRelocation(Fragment->getParent(), MRE);
-}
-
-MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
-                                                bool Is64Bit,
-                                                uint32_t CPUType,
-                                                uint32_t CPUSubtype) {
-  return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
-                                                        CPUType,
-                                                        CPUSubtype),
-                                OS, /*IsLittleEndian=*/true);
-}
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 06043ecd3f30e..b0bb313ec6396 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -53,10 +53,6 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// relocation models.
   unsigned GlobalBaseReg;
 
-  /// ReserveFP - whether the function should reserve the frame pointer
-  /// when allocating, even if there may not actually be a frame pointer used.
-  bool ReserveFP;
-
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
   /// RegSaveFrameIndex - X86-64 vararg func register save area.
@@ -65,6 +61,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   unsigned VarArgsGPOffset;
   /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
   unsigned VarArgsFPOffset;
+  /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+  /// being passed on the stack.
+  unsigned ArgumentStackSize;
 
 public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
@@ -77,7 +76,8 @@ public:
                              VarArgsFrameIndex(0),
                              RegSaveFrameIndex(0),
                              VarArgsGPOffset(0),
-                             VarArgsFPOffset(0) {}
+                             VarArgsFPOffset(0),
+                             ArgumentStackSize(0) {}
   
   explicit X86MachineFunctionInfo(MachineFunction &MF)
     : ForceFramePointer(false),
@@ -87,11 +87,11 @@ public:
       TailCallReturnAddrDelta(0),
       SRetReturnReg(0),
       GlobalBaseReg(0),
-      ReserveFP(false),
       VarArgsFrameIndex(0),
       RegSaveFrameIndex(0),
       VarArgsGPOffset(0),
-      VarArgsFPOffset(0) {}
+      VarArgsFPOffset(0),
+      ArgumentStackSize(0) {}
   
   bool getForceFramePointer() const { return ForceFramePointer;} 
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -114,9 +114,6 @@ public:
   unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
   void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
 
-  bool getReserveFP() const { return ReserveFP; }
-  void setReserveFP(bool reserveFP) { ReserveFP = reserveFP; }
-
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
   void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
 
@@ -128,6 +125,9 @@ public:
 
   unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
   void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index f2faf59367a1e..c1ac9f3431161 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,7 +27,6 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -53,7 +52,13 @@ ForceStackAlign("force-align-stack",
 
 X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
                                  const TargetInstrInfo &tii)
-  : X86GenRegisterInfo(), TM(tm), TII(tii) {
+  : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
+                         ? X86::RIP : X86::EIP,
+                       X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
+                       X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true)),
+                       TM(tm), TII(tii) {
+  X86_MC::InitLLVM2SEHRegisterMapping(this);
+
   // Cache some information.
   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
   Is64Bit = Subtarget->is64Bit();
@@ -70,40 +75,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
   }
 }
 
-static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) {
-  if (!Subtarget->is64Bit()) {
-    if (Subtarget->isTargetDarwin()) {
-      if (isEH)
-        return DWARFFlavour::X86_32_DarwinEH;
-      else
-        return DWARFFlavour::X86_32_Generic;
-    } else if (Subtarget->isTargetCygMing()) {
-      // Unsupported by now, just quick fallback
-      return DWARFFlavour::X86_32_Generic;
-    } else {
-      return DWARFFlavour::X86_32_Generic;
-    }
-  }
-  return DWARFFlavour::X86_64;
-}
-
-/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
-/// specific numbering, used in debug info and exception tables.
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  unsigned Flavour = getFlavour(Subtarget, isEH);
-
-  return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
-}
-
-/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register.
-int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  unsigned Flavour = getFlavour(Subtarget, isEH);
-
-  return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour);
-}
-
 /// getCompactUnwindRegNum - This function maps the register to the number for
 /// compact unwind encoding. Return -1 if the register isn't valid.
 int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
@@ -121,7 +92,7 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
 
 int
 X86RegisterInfo::getSEHRegNum(unsigned i) const {
-  int reg = getX86RegNum(i);
+  int reg = X86_MC::getX86RegNum(i);
   switch (i) {
   case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
   case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
@@ -140,96 +111,16 @@ X86RegisterInfo::getSEHRegNum(unsigned i) const {
   return reg;
 }
 
-/// getX86RegNum - This function maps LLVM register identifiers to their X86
-/// specific numbering, which is used in various places encoding instructions.
-unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
-  switch(RegNo) {
-  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
-  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
-  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
-  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
-  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
-    return N86::ESP;
-  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
-    return N86::EBP;
-  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
-    return N86::ESI;
-  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
-    return N86::EDI;
-
-  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
-    return N86::EAX;
-  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
-    return N86::ECX;
-  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
-    return N86::EDX;
-  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
-    return N86::EBX;
-  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
-    return N86::ESP;
-  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
-    return N86::EBP;
-  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
-    return N86::ESI;
-  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
-    return N86::EDI;
-
-  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
-  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
-    return RegNo-X86::ST0;
-
-  case X86::XMM0: case X86::XMM8:
-  case X86::YMM0: case X86::YMM8: case X86::MM0:
-    return 0;
-  case X86::XMM1: case X86::XMM9:
-  case X86::YMM1: case X86::YMM9: case X86::MM1:
-    return 1;
-  case X86::XMM2: case X86::XMM10:
-  case X86::YMM2: case X86::YMM10: case X86::MM2:
-    return 2;
-  case X86::XMM3: case X86::XMM11:
-  case X86::YMM3: case X86::YMM11: case X86::MM3:
-    return 3;
-  case X86::XMM4: case X86::XMM12:
-  case X86::YMM4: case X86::YMM12: case X86::MM4:
-    return 4;
-  case X86::XMM5: case X86::XMM13:
-  case X86::YMM5: case X86::YMM13: case X86::MM5:
-    return 5;
-  case X86::XMM6: case X86::XMM14:
-  case X86::YMM6: case X86::YMM14: case X86::MM6:
-    return 6;
-  case X86::XMM7: case X86::XMM15:
-  case X86::YMM7: case X86::YMM15: case X86::MM7:
-    return 7;
-
-  case X86::ES: return 0;
-  case X86::CS: return 1;
-  case X86::SS: return 2;
-  case X86::DS: return 3;
-  case X86::FS: return 4;
-  case X86::GS: return 5;
-
-  case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
-  case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
-  case X86::CR2: case X86::CR10: case X86::DR2: return 2;
-  case X86::CR3: case X86::CR11: case X86::DR3: return 3;
-  case X86::CR4: case X86::CR12: case X86::DR4: return 4;
-  case X86::CR5: case X86::CR13: case X86::DR5: return 5;
-  case X86::CR6: case X86::CR14: case X86::DR6: return 6;
-  case X86::CR7: case X86::CR15: case X86::DR7: return 7;
-
-  // Pseudo index registers are equivalent to a "none"
-  // scaled index (See Intel Manual 2A, table 2-3)
-  case X86::EIZ:
-  case X86::RIZ:
-    return 4;
-
-  default:
-    assert(isVirtualRegister(RegNo) && "Unknown physical register!");
-    llvm_unreachable("Register allocator hasn't allocated reg correctly yet!");
-    return 0;
-  }
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+                                       unsigned Idx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  // It behaves just like the sub_8bit_hi index.
+  if (!Is64Bit && Idx == X86::sub_8bit)
+    Idx = X86::sub_8bit_hi;
+
+  // Forward to TableGen's default version.
+  return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
 }
 
 const TargetRegisterClass *
@@ -355,8 +246,19 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
 
 const TargetRegisterClass*
 X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  // Don't allow super-classes of GR8_NOREX.  This class is only used after
+  // extrating sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
+  // to the full GR8 register class in 64-bit mode, so we cannot allow the
+  // reigster class inflation.
+  //
+  // The GR8_NOREX class is always used in a way that won't be constrained to a
+  // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+  // full GR8 class.
+  if (RC == X86::GR8_NOREXRegisterClass)
+    return RC;
+
   const TargetRegisterClass *Super = RC;
-  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
   do {
     switch (Super->getID()) {
     case X86::GR8RegClassID:
@@ -741,11 +643,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 }
 
-unsigned X86RegisterInfo::getRARegister() const {
-  return Is64Bit ? X86::RIP     // Should have dwarf #16.
-                 : X86::EIP;    // Should have dwarf #8.
-}
-
 unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
@@ -948,7 +845,7 @@ namespace {
       for (unsigned i = 0, e = RI.getNumVirtRegs(); i != e; ++i) {
         unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
         if (RI.getRegClass(Reg)->getAlignment() > StackAlignment) {
-          FuncInfo->setReserveFP(true);
+          FuncInfo->setForceFramePointer(true);
           return true;
         }
       }
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index a12eb1297f7e1..7d39c68535978 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -24,22 +24,6 @@ namespace llvm {
   class TargetInstrInfo;
   class X86TargetMachine;
 
-/// N86 namespace - Native X86 register numbers
-///
-namespace N86 {
-  enum {
-    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
-  };
-}
-
-/// DWARFFlavour - Flavour of dwarf regnumbers
-///
-namespace DWARFFlavour {
-  enum {
-    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
-  };
-} 
-  
 class X86RegisterInfo : public X86GenRegisterInfo {
 public:
   X86TargetMachine &TM;
@@ -73,11 +57,6 @@ public:
   /// register identifier.
   static unsigned getX86RegNum(unsigned RegNo);
 
-  /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
-  /// (created by TableGen) for target dependencies.
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const;
 
@@ -95,6 +74,9 @@ public:
   getMatchingSuperRegClass(const TargetRegisterClass *A,
                            const TargetRegisterClass *B, unsigned Idx) const;
 
+  virtual const TargetRegisterClass *
+  getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const;
+
   const TargetRegisterClass*
   getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
 
@@ -136,7 +118,6 @@ public:
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getStackRegister() const { return StackPtr; }
   // FIXME: Move to FrameInfok
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 203722a661620..9a7db36e08716 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -390,6 +390,13 @@ def GR64_NOREX : RegisterClass<"X86", [i64], 64,
                        (GR32_NOREX sub_32bit)];
 }
 
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> {
+  let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
+}
+
 // GR32_NOSP - GR32 registers except ESP.
 def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
@@ -455,8 +462,8 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
 }
 
-def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256,
-                          (sequence "YMM%u", 0, 15)> {
+def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          256, (sequence "YMM%u", 0, 15)> {
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
 }
 
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 02754f9ae503c..6406bce31187f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -54,7 +54,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
     if (const char *bzeroEntry =  V &&
         V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
       EVT IntPtr = TLI.getPointerTy();
-      const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+      Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
       TargetLowering::ArgListTy Args;
       TargetLowering::ArgListEntry Entry;
       Entry.Node = Dst;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 5e6c659e53934..7064dd06fa307 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -16,9 +16,11 @@
 #include "X86InstrInfo.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/SmallVector.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
@@ -185,24 +187,53 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
 
   X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
   
-  if ((EDX >> 15) & 1) HasCMov = true;      ToggleFeature(X86::FeatureCMOV);
-  if ((EDX >> 23) & 1) X86SSELevel = MMX;   ToggleFeature(X86::FeatureMMX);
-  if ((EDX >> 25) & 1) X86SSELevel = SSE1;  ToggleFeature(X86::FeatureSSE1);
-  if ((EDX >> 26) & 1) X86SSELevel = SSE2;  ToggleFeature(X86::FeatureSSE2);
-  if (ECX & 0x1)       X86SSELevel = SSE3;  ToggleFeature(X86::FeatureSSE3);
-  if ((ECX >> 9)  & 1) X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);
-  if ((ECX >> 19) & 1) X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);
-  if ((ECX >> 20) & 1) X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);
+  if ((EDX >> 15) & 1) { HasCMov = true;      ToggleFeature(X86::FeatureCMOV); }
+  if ((EDX >> 23) & 1) { X86SSELevel = MMX;   ToggleFeature(X86::FeatureMMX);  }
+  if ((EDX >> 25) & 1) { X86SSELevel = SSE1;  ToggleFeature(X86::FeatureSSE1); }
+  if ((EDX >> 26) & 1) { X86SSELevel = SSE2;  ToggleFeature(X86::FeatureSSE2); }
+  if (ECX & 0x1)       { X86SSELevel = SSE3;  ToggleFeature(X86::FeatureSSE3); }
+  if ((ECX >> 9)  & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
+  if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
+  if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
   // FIXME: AVX codegen support is not ready.
-  //if ((ECX >> 28) & 1) { HasAVX = true; } ToggleFeature(X86::FeatureAVX);
+  //if ((ECX >> 28) & 1) { HasAVX = true;  ToggleFeature(X86::FeatureAVX); }
 
   bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
   bool IsAMD   = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
 
-  HasCLMUL = IsIntel && ((ECX >> 1) & 0x1);   ToggleFeature(X86::FeatureCLMUL);
-  HasFMA3  = IsIntel && ((ECX >> 12) & 0x1);  ToggleFeature(X86::FeatureFMA3);
-  HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); ToggleFeature(X86::FeaturePOPCNT);
-  HasAES   = IsIntel && ((ECX >> 25) & 0x1);  ToggleFeature(X86::FeatureAES);
+  if (IsIntel && ((ECX >> 1) & 0x1)) {
+    HasCLMUL = true;
+    ToggleFeature(X86::FeatureCLMUL);
+  }
+  if (IsIntel && ((ECX >> 12) & 0x1)) {
+    HasFMA3 = true;
+    ToggleFeature(X86::FeatureFMA3);
+  }
+  if (IsIntel && ((ECX >> 22) & 0x1)) {
+    HasMOVBE = true;
+    ToggleFeature(X86::FeatureMOVBE);
+  }
+  if (IsIntel && ((ECX >> 23) & 0x1)) {
+    HasPOPCNT = true;
+    ToggleFeature(X86::FeaturePOPCNT);
+  }
+  if (IsIntel && ((ECX >> 25) & 0x1)) {
+    HasAES = true;
+    ToggleFeature(X86::FeatureAES);
+  }
+  if (IsIntel && ((ECX >> 29) & 0x1)) {
+    HasF16C = true;
+    ToggleFeature(X86::FeatureF16C);
+  }
+  if (IsIntel && ((ECX >> 30) & 0x1)) {
+    HasRDRAND = true;
+    ToggleFeature(X86::FeatureRDRAND);
+  }
+
+  if ((ECX >> 13) & 0x1) {
+    HasCmpxchg16b = true;
+    ToggleFeature(X86::FeatureCMPXCHG16B);
+  }
 
   if (IsIntel || IsAMD) {
     // Determine if bit test memory instructions are slow.
@@ -224,6 +255,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
       HasX86_64 = true;
       ToggleFeature(X86::Feature64Bit);
     }
+    if ((ECX >> 5) & 0x1) {
+      HasLZCNT = true;
+      ToggleFeature(X86::FeatureLZCNT);
+    }
     if (IsAMD && ((ECX >> 6) & 0x1)) {
       HasSSE4A = true;
       ToggleFeature(X86::FeatureSSE4A);
@@ -251,14 +286,21 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   , HasCLMUL(false)
   , HasFMA3(false)
   , HasFMA4(false)
+  , HasMOVBE(false)
+  , HasRDRAND(false)
+  , HasF16C(false)
+  , HasLZCNT(false)
+  , HasBMI(false)
   , IsBTMemSlow(false)
   , IsUAMemFast(false)
   , HasVectorUAMem(false)
+  , HasCmpxchg16b(false)
   , stackAlignment(8)
   // FIXME: this is a known good value for Yonah. How about others?
   , MaxInlineSizeThreshold(128)
   , TargetTriple(TT)
-  , In64BitMode(is64Bit) {
+  , In64BitMode(is64Bit)
+  , InNaClMode(false) {
   // Determine default and user specified characteristics
   if (!FS.empty() || !CPU.empty()) {
     std::string CPUName = CPU;
@@ -304,6 +346,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   if (In64BitMode)
     ToggleFeature(X86::Mode64Bit);
 
+  if (isTargetNaCl()) {
+    InNaClMode = true;
+    ToggleFeature(X86::ModeNaCl);
+  }
+
   if (HasAVX)
     X86SSELevel = NoMMXSSE;
     
@@ -313,6 +360,9 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   assert((!In64BitMode || HasX86_64) &&
          "64-bit code requested on a subtarget that doesn't support it!");
 
+  if(EnableSegmentedStacks && !isTargetELF())
+    report_fatal_error("Segmented stacks are only implemented on ELF.");
+
   // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
   // 32 and 64 bit) and for all 64-bit targets.
   if (StackAlignOverride)
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6d22027b7aa8e..3258d3d0ada39 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -90,6 +90,21 @@ protected:
   /// HasFMA4 - Target has 4-operand fused multiply-add
   bool HasFMA4;
 
+  /// HasMOVBE - True if the processor has the MOVBE instruction.
+  bool HasMOVBE;
+
+  /// HasRDRAND - True if the processor has the RDRAND instruction.
+  bool HasRDRAND;
+
+  /// HasF16C - Processor has 16-bit floating point conversion instructions.
+  bool HasF16C;
+
+  /// HasLZCNT - Processor has LZCNT instruction.
+  bool HasLZCNT;
+
+  /// HasBMI - Processor has BMI1 instructions.
+  bool HasBMI;
+
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
@@ -100,6 +115,10 @@ protected:
   /// operands. This may require setting a feature bit in the processor.
   bool HasVectorUAMem;
 
+  /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+  /// this is true for most x86-64 chips, but not the first AMD chips.
+  bool HasCmpxchg16b;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -115,6 +134,9 @@ private:
   /// In64BitMode - True if compiling for 64-bit, false for 32-bit.
   bool In64BitMode;
 
+  /// InNaClMode - True if compiling for Native Client target.
+  bool InNaClMode;
+
 public:
 
   /// This constructor initializes the data members to match that
@@ -165,9 +187,15 @@ public:
   bool hasCLMUL() const { return HasCLMUL; }
   bool hasFMA3() const { return HasFMA3; }
   bool hasFMA4() const { return HasFMA4; }
+  bool hasMOVBE() const { return HasMOVBE; }
+  bool hasRDRAND() const { return HasRDRAND; }
+  bool hasF16C() const { return HasF16C; }
+  bool hasLZCNT() const { return HasLZCNT; }
+  bool hasBMI() const { return HasBMI; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool hasCmpxchg16b() const { return HasCmpxchg16b; }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
@@ -185,6 +213,11 @@ public:
     return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing();
   }
   bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+  bool isTargetNaCl() const {
+    return TargetTriple.getOS() == Triple::NativeClient;
+  }
+  bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+  bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
 
   bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
   bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
@@ -199,7 +232,8 @@ public:
   }
 
   bool isTargetWin64() const {
-    return In64BitMode && (isTargetMingw() || isTargetWindows());
+    // FIXME: x86_64-cygwin has not been released yet.
+    return In64BitMode && (isTargetCygMing() || isTargetWindows());
   }
 
   bool isTargetEnvMacho() const {
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 9cab0e0890986..15c6c4e7a7d21 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -16,65 +16,32 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
-                                    MCContext &Ctx, TargetAsmBackend &TAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
-
-  if (TheTriple.isOSWindows())
-    return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
-
-  return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
-}
-
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
   RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
-
-  // Register the code emitter.
-  TargetRegistry::RegisterCodeEmitter(TheX86_32Target,
-                                      createX86MCCodeEmitter);
-  TargetRegistry::RegisterCodeEmitter(TheX86_64Target,
-                                      createX86MCCodeEmitter);
-
-  // Register the asm backend.
-  TargetRegistry::RegisterAsmBackend(TheX86_32Target,
-                                     createX86_32AsmBackend);
-  TargetRegistry::RegisterAsmBackend(TheX86_64Target,
-                                     createX86_64AsmBackend);
-
-  // Register the object streamer.
-  TargetRegistry::RegisterObjectStreamer(TheX86_32Target,
-                                         createMCStreamer);
-  TargetRegistry::RegisterObjectStreamer(TheX86_64Target,
-                                         createMCStreamer);
 }
 
 
-X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
-                                         const std::string &CPU,
-                                         const std::string &FS)
-  : X86TargetMachine(T, TT, CPU, FS, false),
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         Reloc::Model RM, CodeModel::Model CM)
+  : X86TargetMachine(T, TT, CPU, FS, RM, CM, false),
     DataLayout(getSubtargetImpl()->isTargetDarwin() ?
-               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-n8:16:32" :
+               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
+               "n8:16:32-S128" :
                (getSubtargetImpl()->isTargetCygMing() ||
                 getSubtargetImpl()->isTargetWindows()) ?
-               "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-n8:16:32" :
-               "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-n8:16:32"),
+               "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-"
+               "n8:16:32-S32" :
+               "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
+               "n8:16:32-S128"),
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
@@ -82,11 +49,12 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
 }
 
 
-X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
-                                         const std::string &CPU, 
-                                         const std::string &FS)
-  : X86TargetMachine(T, TT, CPU, FS, true),
-    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-n8:16:32:64"),
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         Reloc::Model RM, CodeModel::Model CM)
+  : X86TargetMachine(T, TT, CPU, FS, RM, CM, true),
+    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+               "n8:16:32:64-S128"),
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
@@ -95,52 +63,14 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
 
 /// X86TargetMachine ctor - Create an X86 target.
 ///
-X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
-                                   const std::string &CPU,
-                                   const std::string &FS, bool is64Bit)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   bool is64Bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit),
     FrameLowering(*this, Subtarget),
     ELFWriterInfo(is64Bit, true) {
-  DefRelocModel = getRelocationModel();
-
-  // If no relocation model was picked, default as appropriate for the target.
-  if (getRelocationModel() == Reloc::Default) {
-    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
-    // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
-    // use static relocation model by default.
-    if (Subtarget.isTargetDarwin()) {
-      if (Subtarget.is64Bit())
-        setRelocationModel(Reloc::PIC_);
-      else
-        setRelocationModel(Reloc::DynamicNoPIC);
-    } else if (Subtarget.isTargetWin64())
-      setRelocationModel(Reloc::PIC_);
-    else
-      setRelocationModel(Reloc::Static);
-  }
-
-  assert(getRelocationModel() != Reloc::Default &&
-         "Relocation mode not picked");
-
-  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
-  // is defined as a model for code which may be used in static or dynamic
-  // executables but not necessarily a shared library. On X86-32 we just
-  // compile in -static mode, in x86-64 we use PIC.
-  if (getRelocationModel() == Reloc::DynamicNoPIC) {
-    if (is64Bit)
-      setRelocationModel(Reloc::PIC_);
-    else if (!Subtarget.isTargetDarwin())
-      setRelocationModel(Reloc::Static);
-  }
-
-  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
-  // the Mach-O file format doesn't support it.
-  if (getRelocationModel() == Reloc::Static &&
-      Subtarget.isTargetDarwin() &&
-      is64Bit)
-    setRelocationModel(Reloc::PIC_);
-
   // Determine the PICStyle based on the target selected.
   if (getRelocationModel() == Reloc::Static) {
     // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -161,15 +91,19 @@ X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
     Subtarget.setPICStyle(PICStyles::GOT);
   }
 
-  // Finally, if we have "none" as our PIC style, force to static mode.
-  if (Subtarget.getPICStyle() == PICStyles::None)
-    setRelocationModel(Reloc::Static);
-
   // default to hard float ABI
   if (FloatABIType == FloatABI::Default)
     FloatABIType = FloatABI::Hard;    
 }
 
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper",
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::init(false));
+
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
@@ -200,46 +134,25 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
 
 bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
                                       CodeGenOpt::Level OptLevel) {
-  if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) {
-    PM.add(createSSEDomainFixPass());
-    return true;
+  bool ShouldPrint = false;
+  if (OptLevel != CodeGenOpt::None &&
+      (Subtarget.hasSSE2() || Subtarget.hasAVX())) {
+    PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
+    ShouldPrint = true;
   }
-  return false;
+
+  if (Subtarget.hasAVX() && UseVZeroUpper) {
+    PM.add(createX86IssueVZeroUpperPass());
+    ShouldPrint = true;
+  }
+
+  return ShouldPrint;
 }
 
 bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
                                       CodeGenOpt::Level OptLevel,
                                       JITCodeEmitter &JCE) {
-  // FIXME: Move this to TargetJITInfo!
-  // On Darwin, do not override 64-bit setting made in X86TargetMachine().
-  if (DefRelocModel == Reloc::Default &&
-      (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
-    setRelocationModel(Reloc::Static);
-    Subtarget.setPICStyle(PICStyles::None);
-  }
-
-
   PM.add(createX86JITCodeEmitterPass(*this, JCE));
 
   return false;
 }
-
-void X86TargetMachine::setCodeModelForStatic() {
-
-    if (getCodeModel() != CodeModel::Default) return;
-
-    // For static codegen, if we're not already set, use Small codegen.
-    setCodeModel(CodeModel::Small);
-}
-
-
-void X86TargetMachine::setCodeModelForJIT() {
-
-  if (getCodeModel() != CodeModel::Default) return;
-
-  // 64-bit JIT places everything in the same buffer except external functions.
-  if (Subtarget.is64Bit())
-    setCodeModel(CodeModel::Large);
-  else
-    setCodeModel(CodeModel::Small);
-}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 885334a365fec..d1569aa9d7510 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -29,21 +29,17 @@
 namespace llvm {
   
 class formatted_raw_ostream;
+class StringRef;
 
 class X86TargetMachine : public LLVMTargetMachine {
   X86Subtarget      Subtarget;
   X86FrameLowering  FrameLowering;
   X86ELFWriterInfo  ELFWriterInfo;
-  Reloc::Model      DefRelocModel; // Reloc model before it's overridden.
 
-private:
-  // We have specific defaults for X86.
-  virtual void setCodeModelForJIT();
-  virtual void setCodeModelForStatic();
-  
 public:
-  X86TargetMachine(const Target &T, const std::string &TT, 
-                   const std::string &CPU, const std::string &FS,
+  X86TargetMachine(const Target &T, StringRef TT, 
+                   StringRef CPU, StringRef FS,
+                   Reloc::Model RM, CodeModel::Model CM,
                    bool is64Bit);
 
   virtual const X86InstrInfo     *getInstrInfo() const {
@@ -87,8 +83,9 @@ class X86_32TargetMachine : public X86TargetMachine {
   X86TargetLowering TLInfo;
   X86JITInfo        JITInfo;
 public:
-  X86_32TargetMachine(const Target &T, const std::string &M,
-                      const std::string &CPU, const std::string &FS);
+  X86_32TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
   virtual const X86TargetLowering *getTargetLowering() const {
     return &TLInfo;
@@ -113,8 +110,9 @@ class X86_64TargetMachine : public X86TargetMachine {
   X86TargetLowering TLInfo;
   X86JITInfo        JITInfo;
 public:
-  X86_64TargetMachine(const Target &T, const std::string &TT,
-                      const std::string &CPU, const std::string &FS);
+  X86_64TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS,
+                      Reloc::Model RM, CodeModel::Model CM);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
   virtual const X86TargetLowering *getTargetLowering() const {
     return &TLInfo;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 1231798297ac6..991f322a6346f 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -43,79 +43,3 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI) const {
   return Mang->getSymbol(GV);
 }
-
-unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const {
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-  else
-    return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const {
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-  else
-    return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const {
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-  else
-    return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getTTypeEncoding() const {
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-  else
-    return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getPersonalityEncoding() const {
-  CodeModel::Model Model = TM.getCodeModel();
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
-                                                 Model == CodeModel::Medium ?
-                                            DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
-  if (Model == CodeModel::Small || Model == CodeModel::Medium)
-    return DW_EH_PE_udata4;
-
-  return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const {
-  CodeModel::Model Model = TM.getCodeModel();
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_pcrel | (Model == CodeModel::Small ?
-                             DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
-  if (Model == CodeModel::Small)
-    return DW_EH_PE_udata4;
-
-  return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const {
-  if (CFI)
-    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-
-  return DW_EH_PE_udata4;
-}
-
-unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const {
-  CodeModel::Model Model = TM.getCodeModel();
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
-                                                 Model == CodeModel::Medium ?
-                                            DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
-  if (Model == CodeModel::Small)
-    return DW_EH_PE_udata4;
-
-  return DW_EH_PE_absptr;
-}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index e21b5bffd0594..d7adf27ecaef8 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -33,28 +33,6 @@ namespace llvm {
                             MachineModuleInfo *MMI) const;
   };
 
-  class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
-    const X86TargetMachine &TM;
-  public:
-    X8632_ELFTargetObjectFile(const X86TargetMachine &tm)
-      :TM(tm) { }
-    virtual unsigned getPersonalityEncoding() const;
-    virtual unsigned getLSDAEncoding() const;
-    virtual unsigned getFDEEncoding(bool CFI) const;
-    virtual unsigned getTTypeEncoding() const;
-  };
-
-  class X8664_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
-    const X86TargetMachine &TM;
-  public:
-    X8664_ELFTargetObjectFile(const X86TargetMachine &tm)
-      :TM(tm) { }
-    virtual unsigned getPersonalityEncoding() const;
-    virtual unsigned getLSDAEncoding() const;
-    virtual unsigned getFDEEncoding(bool CFI) const;
-    virtual unsigned getTTypeEncoding() const;
-  };
-
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000000000..39584942468d1
--- /dev/null
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,105 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when tranfering control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+  struct VZeroUpperInserter : public MachineFunctionPass {
+    static char ID;
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    MachineBasicBlock *MBB;     // Current basic block
+  };
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  bool Changed = false;
+
+  // Process any unreachable blocks in arbitrary order now.
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    Changed |= processBasicBlock(MF, *BB);
+
+  return Changed;
+}
+
+static bool isCallToModuleFn(const MachineInstr *MI) {
+  assert(MI->getDesc().isCall() && "Isn't a call instruction");
+
+  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isGlobal())
+      continue;
+
+    const GlobalValue *GV = MO.getGlobal();
+    GlobalValue::LinkageTypes LT = GV->getLinkage();
+    if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
+        (GV->isExternalLinkage(LT) && !GV->isDeclaration()))
+      return true;
+
+    return false;
+  }
+  return false;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzero upper instructions before function calls.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+                                           MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    DebugLoc dl = I->getDebugLoc();
+
+    // Insert a vzeroupper instruction before each control transfer
+    // to functions outside this module
+    if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
+      BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+      ++NumVZU;
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index a1d73c6b4f99d..3dc51e1991ede 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -1,11 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS XCore.td)
 
-tablegen(XCoreGenRegisterInfo.inc -gen-register-info)
-tablegen(XCoreGenInstrInfo.inc -gen-instr-info)
-tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
-tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
-tablegen(XCoreGenCallingConv.inc -gen-callingconv)
-tablegen(XCoreGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(XCoreGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(XCoreGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(XCoreGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(XCoreGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(XCoreCommonTableGen)
 
 add_llvm_target(XCoreCodeGen
   XCoreAsmPrinter.cpp
@@ -20,5 +21,17 @@ add_llvm_target(XCoreCodeGen
   XCoreSelectionDAGInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMXCoreCodeGen
+  LLVMAsmPrinter
+  LLVMCodeGen
+  LLVMCore
+  LLVMMC
+  LLVMSelectionDAG
+  LLVMSupport
+  LLVMTarget
+  LLVMXCoreDesc
+  LLVMXCoreInfo
+  )
+
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
index c3b3dc9e647d6..269822db7113b 100644
--- a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
@@ -3,5 +3,12 @@ add_llvm_library(LLVMXCoreDesc
   XCoreMCAsmInfo.cpp
   )
 
+add_llvm_library_dependencies(LLVMXCoreDesc
+  LLVMMC
+  LLVMXCoreInfo
+  )
+
+add_dependencies(LLVMXCoreDesc XCoreCommonTableGen)
+
 # Hack: we need to include 'main' target directory to grab private headers
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 939d97c9d87cf..276e841e6acc9 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -13,10 +13,11 @@
 
 #include "XCoreMCTargetDesc.h"
 #include "XCoreMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
 #include "XCoreGenInstrInfo.inc"
@@ -35,8 +36,10 @@ static MCInstrInfo *createXCoreMCInstrInfo() {
   return X;
 }
 
-extern "C" void LLVMInitializeXCoreMCInstrInfo() {
-  TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitXCoreMCRegisterInfo(X, XCore::LR);
+  return X;
 }
 
 static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,11 +49,40 @@ static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-extern "C" void LLVMInitializeXCoreMCSubtargetInfo() {
-  TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
-                                          createXCoreMCSubtargetInfo);
+static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new XCoreMCAsmInfo(T, TT);
+
+  // Initial state of the frame pointer is SP.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(XCore::SP, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM);
+  return X;
 }
 
-extern "C" void LLVMInitializeXCoreMCAsmInfo() {
-  RegisterMCAsmInfo<XCoreMCAsmInfo> X(TheXCoreTarget);
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheXCoreTarget, createXCoreMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheXCoreTarget,
+                                        createXCoreMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheXCoreTarget, createXCoreMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
+                                          createXCoreMCSubtargetInfo);
 }
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
index c147b8a66bc33..7f84f69043058 100644
--- a/lib/Target/XCore/TargetInfo/CMakeLists.txt
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMXCoreInfo
   XCoreTargetInfo.cpp
   )
 
-add_dependencies(LLVMXCoreInfo XCoreCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMXCoreInfo
+  LLVMMC
+  LLVMSupport
+  LLVMTarget
+  )
+
+add_dependencies(LLVMXCoreInfo XCoreCommonTableGen)
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index 7aa8965c4ac68..9a0971d1e45f9 100644
--- a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -9,7 +9,7 @@
 
 #include "XCore.h"
 #include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 Target llvm::TheXCoreTarget;
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 1a43714d63b9e..8906b2459e0ac 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
+#include "llvm/Analysis/DebugInfo.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,11 +33,11 @@
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cctype>
@@ -51,6 +52,7 @@ static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
 namespace {
   class XCoreAsmPrinter : public AsmPrinter {
     const XCoreSubtarget &Subtarget;
+    void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
   public:
     explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()){}
@@ -79,6 +81,7 @@ namespace {
     void EmitFunctionEntryLabel();
     void EmitInstruction(const MachineInstr *MI);
     void EmitFunctionBodyEnd();
+    virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
   };
 } // end of anonymous namespace
 
@@ -88,7 +91,7 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
   assert(((GV->hasExternalLinkage() ||
     GV->hasWeakLinkage()) ||
     GV->hasLinkOnceLinkage()) && "Unexpected linkage");
-  if (const ArrayType *ATy = dyn_cast<ArrayType>(
+  if (ArrayType *ATy = dyn_cast<ArrayType>(
     cast<PointerType>(GV->getType())->getElementType())) {
     OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
     // FIXME: MCStreamerize.
@@ -261,16 +264,57 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
   return false;
 }
 
+void XCoreAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                             raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps-2, OS);
+}
+
+MachineLocation XCoreAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  // Handles frame addresses emitted in XCoreInstrInfo::emitFrameIndexDebugValue.
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+         "Unexpected MachineOperand types");
+  return MachineLocation(MI->getOperand(0).getReg(),
+                         MI->getOperand(1).getImm());
+}
+
 void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   SmallString<128> Str;
   raw_svector_ostream O(Str);
 
-  // Check for mov mnemonic
-  if (MI->getOpcode() == XCore::ADD_2rus && !MI->getOperand(2).getImm())
-    O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", "
-      << getRegisterName(MI->getOperand(1).getReg());
-  else
-    printInstruction(MI, O);
+  switch (MI->getOpcode()) {
+  case XCore::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+  case XCore::ADD_2rus:
+    if (MI->getOperand(2).getImm() == 0) {
+      O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", "
+        << getRegisterName(MI->getOperand(1).getReg());
+      OutStreamer.EmitRawText(O.str());
+      return;
+    }
+    break;
+  }
+  printInstruction(MI, O);
   OutStreamer.EmitRawText(O.str());
 }
 
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 057822074e546..7f8b169819a74 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -100,7 +100,8 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   bool FP = hasFP(MF);
-  bool Nested = MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::Nest);
+  bool Nested = MF.getFunction()->
+                getAttributes().hasAttrSomewhere(Attribute::Nest);
 
   if (Nested) {
     loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
@@ -270,14 +271,6 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-void XCoreFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
-                                                                        const {
-  // Initial state of the frame pointer is SP.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(XCore::SP, 0);
-  Moves.push_back(MachineMove(0, Dst, Src));
-}
-
 bool XCoreFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator MI,
                                         const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 7da19f0deb1b5..c591e93780aa5 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -42,8 +42,6 @@ namespace llvm {
 
     bool hasFP(const MachineFunction &MF) const;
 
-    void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-
     void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                               RegScavenger *RS = NULL) const;
 
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index a8dd8479d45f1..4dac1cee98270 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -169,9 +169,14 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
         CurDAG->getTargetConstantPool(ConstantInt::get(
                               Type::getInt32Ty(*CurDAG->getContext()), Val),
                                       TLI.getPointerTy());
-      return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, 
-                                    MVT::Other, CPIdx, 
-                                    CurDAG->getEntryNode());
+      SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+                                            MVT::Other, CPIdx,
+                                            CurDAG->getEntryNode());
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = MF->getMachineMemOperand(
+        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4);      
+      cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+      return node;
     }
     break;
   }
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 6d040e052659c..2afe0e35afb1b 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -81,6 +81,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
 
   // Use i32 for setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
   // XCore does not have the NodeTypes below.
   setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
@@ -147,7 +148,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
   // TRAMPOLINE is custom lowered.
-  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 
   maxStoresPerMemset = maxStoresPerMemsetOptSize = 4;
   maxStoresPerMemmove = maxStoresPerMemmoveOptSize
@@ -180,7 +182,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADD:
   case ISD::SUB:              return ExpandADDSUB(Op.getNode(), DAG);
   case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
-  case ISD::TRAMPOLINE:       return LowerTRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:  return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
   default:
     llvm_unreachable("unimplemented operand");
     return SDValue();
@@ -252,8 +255,8 @@ static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
                      DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
 }
 
-static inline bool isZeroLengthArray(const Type *Ty) {
-  const ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
+static inline bool isZeroLengthArray(Type *Ty) {
+  ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
   return AT && (AT->getNumElements() == 0);
 }
 
@@ -275,7 +278,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     llvm_unreachable("Thread local object not a GlobalVariable?");
     return SDValue();
   }
-  const Type *Ty = cast<PointerType>(GV->getType())->getElementType();
+  Type *Ty = cast<PointerType>(GV->getType())->getElementType();
   if (!Ty->isSized() || isZeroLengthArray(Ty)) {
 #ifndef NDEBUG
     errs() << "Size of thread local object " << GVar->getName()
@@ -465,7 +468,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Lower to a call to __misaligned_load(BasePtr).
-  const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -524,7 +527,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   }
 
   // Lower to a call to __misaligned_store(BasePtr, Value).
-  const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -789,7 +792,12 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
 }
 
 SDValue XCoreTargetLowering::
-LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
@@ -841,9 +849,7 @@ LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
                               MachinePointerInfo(TrmpAddr, 16), false, false,
                               0);
 
-  SDValue Ops[] =
-    { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5) };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1148,10 +1154,10 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
       int offset = 0;
       // Save remaining registers, storing higher register numbers at a higher
       // address
-      for (unsigned i = array_lengthof(ArgRegs) - 1; i >= FirstVAReg; --i) {
+      for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
         // Create a stack slot
         int FI = MFI->CreateFixedObject(4, offset, true);
-        if (i == FirstVAReg) {
+        if (i == (int)FirstVAReg) {
           XFI->setVarArgsFrameIndex(FI);
         }
         offset -= StackSlotSize;
@@ -1409,7 +1415,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
     // operands are constant canonicalize smallest to RHS.
     if ((N0C && !N1C) ||
         (N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
-      return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT), N1, N0, N2, N3);
+      return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+                         N1, N0, N2, N3);
 
     // lmul(x, 0, a, b)
     if (N1C && N1C->isNullValue()) {
@@ -1548,7 +1555,7 @@ static inline bool isImmUs4(int64_t val)
 /// by AM is legal for this target, for a load/store of the specified type.
 bool
 XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              const Type *Ty) const {
+                                              Type *Ty) const {
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 9c803bef6dd2c..d6c5b329a0a07 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -101,7 +101,7 @@ namespace llvm {
                                   MachineBasicBlock *MBB) const;
 
     virtual bool isLegalAddressingMode(const AddrMode &AM,
-                                       const Type *Ty) const;
+                                       Type *Ty) const;
 
   private:
     const XCoreTargetMachine &TM;
@@ -145,7 +145,8 @@ namespace llvm {
     SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
 
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass*>
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index f90481f3fbc9c..a0946a197a1a7 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -17,11 +17,10 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_CTOR
 #include "XCoreGenInstrInfo.inc"
@@ -387,6 +386,15 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     .addImm(0);
 }
 
+MachineInstr*
+XCoreInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(XCore::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
 /// ReverseBranchCondition - Return the inverse opcode of the 
 /// specified Branch instruction.
 bool XCoreInstrInfo::
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 840b1e1636528..d354802ee03fa 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -78,6 +78,11 @@ public:
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
 
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
 
   virtual bool ReverseBranchCondition(
                             SmallVectorImpl<MachineOperand> &Cond) const;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 55c7527f4e831..4d2e93bc7a040 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -572,7 +572,7 @@ def STWDP_lru6 : _FLRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
                   [(store GRRegs:$val, ADDRdpii:$addr)]>;
 
 //let Uses = [CP] in ..
-let mayLoad = 1, isReMaterializable = 1 in
+let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in
 defm LDWCP : FRU6_LRU6_cp<"ldw">;
 
 let Uses = [SP] in {
@@ -739,7 +739,7 @@ def LDAP_lu10_ba : _FLU10<(outs),
 
 let isCall=1,
 // All calls clobber the link register and the non-callee-saved registers:
-Defs = [R0, R1, R2, R3, R11, LR] in {
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
 def BL_u10 : _FU10<
                   (outs),
                   (ins calltarget:$target, variable_ops),
@@ -754,7 +754,7 @@ def BL_lu10 : _FLU10<
 }
 
 // Two operand short
-// TODO eet, eef, testwct, tsetmr, sext (reg), zext (reg)
+// TODO eet, eef, tsetmr
 def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
                  "not $dst, $b",
                  [(set GRRegs:$dst, (not GRRegs:$b))]>;
@@ -764,15 +764,25 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
                  [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
 
 let Constraints = "$src1 = $dst" in {
-let neverHasSideEffects = 1 in
 def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
-                 "sext $dst, $src2",
-                 []>;
+                      "sext $dst, $src2",
+                      [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+                                                         immBitp:$src2))]>;
+
+def SEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+                     "sext $dst, $src2",
+                     [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+                                                        GRRegs:$src2))]>;
 
-let neverHasSideEffects = 1 in
 def ZEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
-                 "zext $dst, $src2",
-                 []>;
+                      "zext $dst, $src2",
+                      [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+                                                         immBitp:$src2))]>;
+
+def ZEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+                     "zext $dst, $src2",
+                     [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+                                                        GRRegs:$src2))]>;
 
 def ANDNOT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
                  "andnot $dst, $src2",
@@ -819,7 +829,8 @@ def OUT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
 let Constraints = "$src = $dst" in
 def OUTSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src),
                  "outshr res[$r], $src",
-                 [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+                 [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r,
+                                                      GRRegs:$src))]>;
 
 def INCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r),
                  "inct $dst, res[$r]",
@@ -836,7 +847,8 @@ def IN_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r),
 let Constraints = "$src = $dst" in
 def INSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src),
                  "inshr $dst, res[$r]",
-                 [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+                 [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r,
+                                                     GRRegs:$src))]>;
 
 def CHKCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
                  "chkct res[$r], $val",
@@ -846,6 +858,14 @@ def CHKCT_rus : _F2R<(outs), (ins GRRegs:$r, i32imm:$val),
                  "chkct res[$r], $val",
                  [(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
 
+def TESTCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                     "testct $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                      "testwct $dst, res[$src]",
+                      [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
 def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
                  "setd res[$r], $val",
                  [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
@@ -871,7 +891,6 @@ def INITDP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src),
                      [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
 
 // Two operand long
-// TODO endin, peek,
 // getd, testlcl
 def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
                  "bitrev $dst, $src",
@@ -917,6 +936,14 @@ def SETPSC_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2),
                        "setpsc res[$src1], $src2",
                        [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
 
+def PEEK_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                      "peek $dst, res[$src]",
+                      [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                       "endin $dst, res[$src]",
+                       [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
 // One operand short
 // TODO edu, eeu, waitet, waitef, tstart, clrtp
 // setdp, setcp, setev, kcall
@@ -960,7 +987,7 @@ def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src),
 
 let isCall=1, 
 // All calls clobber the link register and the non-callee-saved registers:
-Defs = [R0, R1, R2, R3, R11, LR] in {
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
 def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops),
                  "bla $addr",
                  [(XCoreBranchLink GRRegs:$addr)]>;
@@ -974,10 +1001,15 @@ def FREER_1r : _F1R<(outs), (ins GRRegs:$r),
                "freer res[$r]",
                [(int_xcore_freer GRRegs:$r)]>;
 
-let Uses=[R11] in
+let Uses=[R11] in {
 def SETV_1r : _F1R<(outs), (ins GRRegs:$r),
-               "setv res[$r], r11",
-               [(int_xcore_setv GRRegs:$r, R11)]>;
+                   "setv res[$r], r11",
+                   [(int_xcore_setv GRRegs:$r, R11)]>;
+
+def SETEV_1r : _F1R<(outs), (ins GRRegs:$r),
+                    "setev res[$r], r11",
+                    [(int_xcore_setev GRRegs:$r, R11)]>;
+}
 
 def EEU_1r : _F1R<(outs), (ins GRRegs:$r),
                "eeu res[$r]",
@@ -985,15 +1017,24 @@ def EEU_1r : _F1R<(outs), (ins GRRegs:$r),
 
 // Zero operand short
 // TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
-// stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
+// stet, getkep, getksp, setkep, getid, kret, dcall, dret,
 // dentsp, drestsp
 
 def CLRE_0R : _F0R<(outs), (ins), "clre", [(int_xcore_clre)]>;
 
-let Defs = [R11] in
+let Defs = [R11] in {
 def GETID_0R : _F0R<(outs), (ins),
-                 "get r11, id",
-                 [(set R11, (int_xcore_getid))]>;
+                    "get r11, id",
+                    [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<(outs), (ins),
+                    "get r11, ed",
+                    [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<(outs), (ins),
+                    "get r11, et",
+                    [(set R11, (int_xcore_getet))]>;
+}
 
 def SSYNC_0r : _F0R<(outs), (ins),
                     "ssync",
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 357a4a0835825..1b78b373fffa4 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -17,7 +17,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
@@ -39,7 +38,7 @@
 using namespace llvm;
 
 XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
-  : XCoreGenRegisterInfo(), TII(tii) {
+  : XCoreGenRegisterInfo(XCore::LR), TII(tii) {
 }
 
 // helper functions
@@ -321,20 +320,8 @@ loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
 }
 
-int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int XCoreRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
-  return XCoreGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
-
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
   return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
 }
-
-unsigned XCoreRegisterInfo::getRARegister() const {
-  return XCore::LR;
-}
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 801d9eba21712..5c28f39d87882 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -60,7 +60,6 @@ public:
                            int SPAdj, RegScavenger *RS = NULL) const;
 
   // Debug information queries.
-  unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
   //! Return the array of argument passing registers
@@ -74,10 +73,6 @@ public:
   
   //! Return whether to emit frame moves
   static bool needsFrameMoves(const MachineFunction &MF);
-
-  //! Get DWARF debugging register number
-  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index ad069bf138a7b..b4e9927104196 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -13,7 +13,7 @@
 
 #include "XCoreSubtarget.h"
 #include "XCore.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 342966ae5c861..fdc5d35036bbc 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -14,15 +14,15 @@
 #include "XCore.h"
 #include "llvm/Module.h"
 #include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 /// XCoreTargetMachine ctor - Create an ILP32 architecture model
 ///
-XCoreTargetMachine::XCoreTargetMachine(const Target &T, const std::string &TT,
-                                       const std::string &CPU,
-                                       const std::string &FS)
-  : LLVMTargetMachine(T, TT, CPU, FS),
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       Reloc::Model RM, CodeModel::Model CM)
+  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
     Subtarget(TT, CPU, FS),
     DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
                "i16:16:32-i32:32:32-i64:32:32-n32"),
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 6235ac3a6a1a5..83d09d6df49de 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -32,8 +32,9 @@ class XCoreTargetMachine : public LLVMTargetMachine {
   XCoreTargetLowering TLInfo;
   XCoreSelectionDAGInfo TSInfo;
 public:
-  XCoreTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &CPU, const std::string &FS);
+  XCoreTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     Reloc::Model RM, CodeModel::Model CM);
 
   virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
   virtual const XCoreFrameLowering *getFrameLowering() const {
-- 
cgit v1.2.3