Diffstat (limited to 'llvm/lib/Target/X86/MCTargetDesc')
19 files changed, 2135 insertions, 738 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 675a9c377b12d..0134b4efce727 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -56,7 +56,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (MI->getOpcode() == X86::CALLpcrel32 && (STI.getFeatureBits()[X86::Mode64Bit])) { OS << "\tcallq\t"; - printPCRelImm(MI, 0, OS); + printPCRelImm(MI, Address, 0, OS); } // data16 and data32 both have the same encoding of 0x66. While data32 is // valid only in 16 bit systems, data16 is valid in the rest. @@ -68,8 +68,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, OS << "\tdata32"; } // Try to print any aliases first. - else if (!printAliasInstr(MI, OS) && - !printVecCompareInstr(MI, OS)) + else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); // Next always print the annotation. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index 3d5d384dc4a01..51ddae61d2510 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -30,9 +30,10 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, + raw_ostream &O); // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS); @@ -46,13 +47,6 @@ public: void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index dffda5217675b..bf3b6bcb5463f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -12,7 +12,9 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -60,10 +62,9 @@ public: else if (BranchType == "indirect") addKind(X86::AlignBranchIndirect); else { - report_fatal_error( - "'-x86-align-branch 'The branches's type is combination of jcc, " - "fused, jmp, call, ret, indirect.(plus separated)", - false); + errs() << "invalid argument " << BranchType.str() + << " to -x86-align-branch=; each element must be one of: fused, " "jcc, jmp, call, ret, indirect (plus separated)\n"; } } } @@ -85,13 +86,14 @@ cl::opt<unsigned> X86AlignBranchBoundary( cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch( "x86-align-branch", - cl::desc("Specify types of branches to align (plus separated list of " "types).
The branches's types are combination of jcc, fused, " - "jmp, call, ret, indirect."), - cl::value_desc("jcc indicates conditional jumps, fused indicates fused " - "conditional jumps, jmp indicates unconditional jumps, call " - "indicates direct and indirect calls, ret indicates rets, " - "indirect indicates indirect jumps."), + cl::desc( + "Specify types of branches to align (plus separated list of types):" + "\njcc indicates conditional jumps" + "\nfused indicates fused conditional jumps" + "\njmp indicates direct unconditional jumps" + "\ncall indicates direct and indirect calls" + "\nret indicates rets" + "\nindirect indicates indirect unconditional jumps"), cl::location(X86AlignBranchKindLoc)); cl::opt<bool> X86AlignBranchWithin32BBoundaries( @@ -102,6 +104,18 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries( "assumptions about labels corresponding to particular instructions, " "and should be used with caution.")); +cl::opt<unsigned> X86PadMaxPrefixSize( + "x86-pad-max-prefix-size", cl::init(0), + cl::desc("Maximum number of prefixes to use for padding")); + +cl::opt<bool> X86PadForAlign( + "x86-pad-for-align", cl::init(true), cl::Hidden, + cl::desc("Pad previous instructions to implement align directives")); + +cl::opt<bool> X86PadForBranchAlign( + "x86-pad-for-branch-align", cl::init(true), cl::Hidden, + cl::desc("Pad previous instructions to implement branch alignment")); + class X86ELFObjectWriter : public MCELFObjectTargetWriter { public: X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine, @@ -114,14 +128,18 @@ class X86AsmBackend : public MCAsmBackend { std::unique_ptr<const MCInstrInfo> MCII; X86AlignBranchKind AlignBranchType; Align AlignBoundary; + unsigned TargetPrefixMax = 0; - bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; - - bool needAlign(MCObjectStreamer &OS) const; - bool needAlignInst(const MCInst &Inst) const; - MCBoundaryAlignFragment * - getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const; MCInst PrevInst; + MCBoundaryAlignFragment *PendingBA = nullptr; + std::pair<MCFragment *, size_t> PrevInstPosition; + bool CanPadInst; + + uint8_t determinePaddingPrefix(const MCInst &Inst) const; + bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; + bool needAlign(const MCInst &Inst) const; + bool canPadBranches(MCObjectStreamer &OS) const; + bool canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const; public: X86AsmBackend(const Target &T, const MCSubtargetInfo &STI) @@ -142,11 +160,14 @@ public: AlignBoundary = assumeAligned(X86AlignBranchBoundary); if (X86AlignBranch.getNumOccurrences()) AlignBranchType = X86AlignBranchKindLoc; + if (X86PadMaxPrefixSize.getNumOccurrences()) + TargetPrefixMax = X86PadMaxPrefixSize; } bool allowAutoPadding() const override; - void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override; - void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override; + bool allowEnhancedRelaxation() const override; + void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override; + void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override; unsigned getNumFixupKinds() const override { return X86::NumTargetFixupKinds; @@ -155,7 +176,7 @@ public: Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override; @@ -171,22 +192,34 @@ public: const MCRelaxableFragment 
*DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; + + bool padInstructionViaRelaxation(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; } // end anonymous namespace -static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) { +static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) { unsigned Op = Inst.getOpcode(); switch (Op) { default: return Op; case X86::JCC_1: - return (is16BitMode) ? X86::JCC_2 : X86::JCC_4; + return (Is16BitMode) ? X86::JCC_2 : X86::JCC_4; case X86::JMP_1: - return (is16BitMode) ? X86::JMP_2 : X86::JMP_4; + return (Is16BitMode) ? X86::JMP_2 : X86::JMP_4; } } @@ -275,11 +308,11 @@ static unsigned getRelaxedOpcodeArith(const MCInst &Inst) { } } -static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) { +static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) { unsigned R = getRelaxedOpcodeArith(Inst); if (R != Inst.getOpcode()) return R; - return getRelaxedOpcodeBranch(Inst, is16BitMode); + return getRelaxedOpcodeBranch(Inst, Is16BitMode); } static X86::CondCode getCondFromBranch(const MCInst &MI, @@ -316,6 +349,11 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) { return (BaseReg == X86::RIP); } +/// Check if the instruction is a prefix. +static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) { + return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags); +} + /// Check if the instruction is valid as the first instruction in macro fusion. static bool isFirstMacroFusibleInst(const MCInst &Inst, const MCInstrInfo &MCII) { @@ -327,6 +365,69 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst, return FIK != X86::FirstMacroFusionInstKind::Invalid; } +/// X86 can reduce the bytes of NOP by padding instructions with prefixes to +/// get a better performance in some cases. Here, we determine which prefix is +/// the most suitable. +/// +/// If the instruction has a segment override prefix, use the existing one. +/// If the target is 64-bit, use CS. +/// If the target is 32-bit, +/// - If the instruction has an ESP/EBP base register, use SS. +/// - Otherwise use DS. +uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { + assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) && + "Prefixes can be added only in 32-bit or 64-bit mode."); + const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) + MemoryOperand += X86II::getOperandBias(Desc); + + unsigned SegmentReg = 0; + if (MemoryOperand >= 0) { + // Check for explicit segment override on memory operand.
+ SegmentReg = Inst.getOperand(MemoryOperand + X86::AddrSegmentReg).getReg(); + } + + switch (TSFlags & X86II::FormMask) { + default: + break; + case X86II::RawFrmDstSrc: { + // Check segment override opcode prefix as needed (not for %ds). + if (Inst.getOperand(2).getReg() != X86::DS) + SegmentReg = Inst.getOperand(2).getReg(); + break; + } + case X86II::RawFrmSrc: { + // Check segment override opcode prefix as needed (not for %ds). + if (Inst.getOperand(1).getReg() != X86::DS) + SegmentReg = Inst.getOperand(1).getReg(); + break; + } + case X86II::RawFrmMemOffs: { + // Check segment override opcode prefix as needed. + SegmentReg = Inst.getOperand(1).getReg(); + break; + } + } + + if (SegmentReg != 0) + return X86::getSegmentOverridePrefixForReg(SegmentReg); + + if (STI.hasFeature(X86::Mode64Bit)) + return X86::CS_Encoding; + + if (MemoryOperand >= 0) { + unsigned BaseRegNum = MemoryOperand + X86::AddrBaseReg; + unsigned BaseReg = Inst.getOperand(BaseRegNum).getReg(); + if (BaseReg == X86::ESP || BaseReg == X86::EBP) + return X86::SS_Encoding; + } + return X86::DS_Encoding; +} + /// Check if the two instructions will be macro-fused on the target cpu. bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const { const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode()); @@ -355,19 +456,122 @@ static bool hasVariantSymbol(const MCInst &MI) { } bool X86AsmBackend::allowAutoPadding() const { - return (AlignBoundary != Align::None() && - AlignBranchType != X86::AlignBranchNone); + return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone); +} + +bool X86AsmBackend::allowEnhancedRelaxation() const { + return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign; +} + +/// X86 has certain instructions which enable interrupts exactly one +/// instruction *after* the instruction which stores to SS. Return true if the +/// given instruction has such an interrupt delay slot. +static bool hasInterruptDelaySlot(const MCInst &Inst) { + switch (Inst.getOpcode()) { + case X86::POPSS16: + case X86::POPSS32: + case X86::STI: + return true; + + case X86::MOV16sr: + case X86::MOV32sr: + case X86::MOV64sr: + case X86::MOV16sm: + if (Inst.getOperand(0).getReg() == X86::SS) + return true; + break; + } + return false; +} + +/// Check if the instruction to be emitted is right after any data. +static bool +isRightAfterData(MCFragment *CurrentFragment, + const std::pair<MCFragment *, size_t> &PrevInstPosition) { + MCFragment *F = CurrentFragment; + // Empty data fragments may be created to prevent further data being + // added into the previous fragment; we need to skip them since they + // have no contents. + for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode()) + if (cast<MCDataFragment>(F)->getContents().size() != 0) + break; + + // Since data is always emitted into a DataFragment, our check strategy is + // simple here. + // - If the fragment is a DataFragment + // - If it's not the fragment where the previous instruction is, + // returns true. + // - If it's the fragment holding the previous instruction but its + // size changed since the previous instruction was emitted into + // it, returns true. + // - Otherwise returns false. + // - If the fragment is not a DataFragment, returns false. + if (auto *DF = dyn_cast_or_null<MCDataFragment>(F)) + return DF != PrevInstPosition.first || + DF->getContents().size() != PrevInstPosition.second; + + return false; +} + +/// \returns the fragment size if it has instructions, otherwise returns 0.
+static size_t getSizeForInstFragment(const MCFragment *F) { + if (!F || !F->hasInstructions()) + return 0; + // MCEncodedFragmentWithContents being templated makes this tricky. + switch (F->getKind()) { + default: + llvm_unreachable("Unknown fragment with instructions!"); + case MCFragment::FT_Data: + return cast<MCDataFragment>(*F).getContents().size(); + case MCFragment::FT_Relaxable: + return cast<MCRelaxableFragment>(*F).getContents().size(); + case MCFragment::FT_CompactEncodedInst: + return cast<MCCompactEncodedInstFragment>(*F).getContents().size(); + } +} + +/// Return true if we can insert NOP or prefixes automatically before the +/// instruction to be emitted. +bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { + if (hasVariantSymbol(Inst)) + // Linker may rewrite the instruction with variant symbol operand (e.g. + // TLSCALL). + return false; + + if (hasInterruptDelaySlot(PrevInst)) + // If this instruction follows an interrupt enabling instruction with a one + // instruction delay, inserting a nop would change behavior. + return false; + + if (isPrefix(PrevInst, *MCII)) + // If this instruction follows a prefix, inserting a nop/prefix would change + // semantics. + return false; + + if (isPrefix(Inst, *MCII)) + // If this instruction is a prefix, inserting a prefix would change + // semantics. + return false; + + if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) + // If this instruction follows any data, there is no clear + // instruction boundary, and inserting a nop/prefix would change semantics. + return false; + + return true; } -bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { +bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { if (!OS.getAllowAutoPadding()) return false; assert(allowAutoPadding() && "incorrect initialization!"); - MCAssembler &Assembler = OS.getAssembler(); - MCSection *Sec = OS.getCurrentSectionOnly(); + // We only pad in the text section. + if (!OS.getCurrentSectionOnly()->getKind().isText()) + return false; + // TODO: we currently don't deal with Bundle cases. - if (Assembler.isBundlingEnabled() && Sec->isBundleLocked()) + if (OS.getAssembler().isBundlingEnabled()) return false; // Branches only need to be aligned in 32-bit or 64-bit mode. @@ -377,59 +581,42 @@ bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { return true; } -/// Check if the instruction operand needs to be aligned. Padding is disabled -/// before intruction which may be rewritten by linker(e.g. TLSCALL). -bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { - // Linker may rewrite the instruction with variant symbol operand. - if (hasVariantSymbol(Inst)) - return false; - - const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode()); - return (InstDesc.isConditionalBranch() && +/// Check if the instruction operand needs to be aligned.
+bool X86AsmBackend::needAlign(const MCInst &Inst) const { + const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); + return (Desc.isConditionalBranch() && (AlignBranchType & X86::AlignBranchJcc)) || - (InstDesc.isUnconditionalBranch() && + (Desc.isUnconditionalBranch() && (AlignBranchType & X86::AlignBranchJmp)) || - (InstDesc.isCall() && - (AlignBranchType & X86::AlignBranchCall)) || - (InstDesc.isReturn() && - (AlignBranchType & X86::AlignBranchRet)) || - (InstDesc.isIndirectBranch() && + (Desc.isCall() && (AlignBranchType & X86::AlignBranchCall)) || + (Desc.isReturn() && (AlignBranchType & X86::AlignBranchRet)) || + (Desc.isIndirectBranch() && (AlignBranchType & X86::AlignBranchIndirect)); } -static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) { - // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it. - return !F.canEmitNops(); -} +/// Insert BoundaryAlignFragment before instructions to align branches. +void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, + const MCInst &Inst) { + CanPadInst = canPadInst(Inst, OS); -MCBoundaryAlignFragment * -X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const { - auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment()); - if (!F || !canReuseBoundaryAlignFragment(*F)) { - F = new MCBoundaryAlignFragment(AlignBoundary); - OS.insert(F); - } - return F; -} + if (!canPadBranches(OS)) + return; + + if (!isMacroFused(PrevInst, Inst)) + // Macro fusion did not actually happen; clear the pending fragment. + PendingBA = nullptr; -/// Insert MCBoundaryAlignFragment before instructions to align branches. -void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, - const MCInst &Inst) { - if (!needAlign(OS)) + if (!CanPadInst) return; - MCFragment *CF = OS.getCurrentFragment(); - bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused; - if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) { + if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) { // Macro fusion actually happens and there is no other fragment inserted - // after the previous instruction. NOP can be emitted in PF to align fused - // jcc. - if (auto *PF = - dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) { - const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true); - const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true); - } - } else if (needAlignInst(Inst)) { + // after the previous instruction. + // + // Do nothing here since we already inserted a BoundaryAlign fragment when + // we met the first instruction in the fused pair and we'll tie them + // together in emitInstructionEnd. + // // Note: When there is at least one fragment, such as MCAlignFragment, // inserted after the previous instruction, e.g. // @@ -441,34 +628,41 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, // // We will treat the JCC as a unfused branch although it may be fused // with the CMP. - auto *F = getOrCreateBoundaryAlignFragment(OS); - F->setEmitNops(true); - F->setFused(false); - } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) { - // We don't know if macro fusion happens until the reaching the next - // instruction, so a place holder is put here if necessary.
- getOrCreateBoundaryAlignFragment(OS); + return; } - PrevInst = Inst; + if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) && + isFirstMacroFusibleInst(Inst, *MCII))) { + // If we meet an unfused branch or the first instruction in a fusible pair, + // insert a BoundaryAlign fragment. + OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary)); + } } -/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned -/// if necessary. -void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { - if (!needAlign(OS)) +/// Set the last fragment to be aligned for the BoundaryAlignFragment. +void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { + PrevInst = Inst; + MCFragment *CF = OS.getCurrentFragment(); + PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); + if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF)) + F->setAllowAutoPadding(CanPadInst); + + if (!canPadBranches(OS)) return; - // If the branch is emitted into a MCRelaxableFragment, we can determine the - // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the - // branch is fused, the fused branch(macro fusion pair) must be emitted into - // two fragments. Or when the branch is unfused, the branch must be emitted - // into one fragment. The MCRelaxableFragment naturally marks the end of the - // fused or unfused branch. - // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of - // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align - // other branch. - if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment())) - OS.insert(new MCBoundaryAlignFragment(AlignBoundary)); + + if (!needAlign(Inst) || !PendingBA) + return; + + // Tie the aligned instructions into the pending BoundaryAlign. + PendingBA->setLastFragment(CF); + PendingBA = nullptr; + + // We need to ensure that further data isn't added to the current + // DataFragment, so that we can get the size of instructions later in + // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty + // DataFragment. + if (isa_and_nonnull<MCDataFragment>(CF)) + OS.insert(new MCDataFragment()); // Update the maximum alignment on the current section if necessary. MCSection *Sec = OS.getCurrentSectionOnly(); @@ -478,13 +672,23 @@ void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { if (STI.getTargetTriple().isOSBinFormatELF()) { + unsigned Type; if (STI.getTargetTriple().getArch() == Triple::x86_64) { - if (Name == "R_X86_64_NONE") - return FK_NONE; + Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/x86_64.def" +#undef ELF_RELOC + .Default(-1u); } else { - if (Name == "R_386_NONE") - return FK_NONE; + Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/i386.def" +#undef ELF_RELOC + .Default(-1u); } + if (Type == -1u) + return None; + return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type); } return MCAsmBackend::getFixupKind(Name); } @@ -502,6 +706,11 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, }; + // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They + // do not require any extra processing.
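Aside (annotation, not part of the patch): the "literal relocation" machinery above boils down to biasing an ELF relocation type by a fixed constant. getFixupKind maps a name such as R_X86_64_64 to FirstLiteralRelocationKind plus the ELF r_type, and getRelocType in the X86ELFObjectWriter hunk further down subtracts the bias again. A minimal sketch of that round trip; the bias value here is assumed for illustration only (the real constant lives in llvm/MC/MCFixup.h):

    constexpr unsigned FirstLiteralRelocationKind = 256; // assumed value, illustration only
    constexpr unsigned R_X86_64_64 = 1;                  // ELF r_type of the 64-bit absolute reloc

    // What getFixupKind effectively does for a recognized R_* name.
    constexpr unsigned toLiteralFixupKind(unsigned ELFType) {
      return FirstLiteralRelocationKind + ELFType;
    }
    // What getRelocType does to recover the r_type when writing the object file.
    constexpr unsigned toELFType(unsigned FixupKind) {
      return FixupKind - FirstLiteralRelocationKind;
    }

    static_assert(toELFType(toLiteralFixupKind(R_X86_64_64)) == R_X86_64_64,
                  "a .reloc fixup kind round-trips through the bias");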
+ if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -514,7 +723,7 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool X86AsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, const MCValue &) { - return Fixup.getKind() == FK_NONE; + return Fixup.getKind() >= FirstLiteralRelocationKind; } static unsigned getFixupKindSize(unsigned Kind) { @@ -556,7 +765,10 @@ void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { - unsigned Size = getFixupKindSize(Fixup.getKind()); + unsigned Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return; + unsigned Size = getFixupKindSize(Kind); assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); @@ -613,12 +825,11 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, // FIXME: Can tblgen help at all here to verify there aren't other instructions // we can relax? -void X86AsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void X86AsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. - bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; - unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode); + bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode); if (RelaxedOp == Inst.getOpcode()) { SmallString<256> Tmp; @@ -628,8 +839,232 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, report_fatal_error("unexpected instruction to relax: " + OS.str()); } - Res = Inst; - Res.setOpcode(RelaxedOp); + Inst.setOpcode(RelaxedOp); +} + +/// Return true if this instruction has been fully relaxed into its most +/// general available form. +static bool isFullyRelaxed(const MCRelaxableFragment &RF) { + auto &Inst = RF.getInst(); + auto &STI = *RF.getSubtargetInfo(); + bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode(); +} + +bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + if (!RF.getAllowAutoPadding()) + return false; + // If the instruction isn't fully relaxed, shifting it around might require a + // larger value for one of the fixups than can be encoded. The outer loop + // will also catch this before moving to the next instruction, but we need to + // prevent padding this single instruction as well. + if (!isFullyRelaxed(RF)) + return false; + + const unsigned OldSize = RF.getContents().size(); + if (OldSize == 15) + return false; + + const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize); + const unsigned RemainingPrefixSize = [&]() -> unsigned { + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + Emitter.emitPrefix(RF.getInst(), VecOS, STI); + assert(Code.size() < 15 && "The number of prefixes must be less than 15."); + + // TODO: It turns out we need a decent amount of plumbing for the target + // specific bits to determine the number of prefixes it's safe to add. Various + // targets (older chips mostly, but also Atom family) encounter decoder + // stalls with too many prefixes.
For testing purposes, we set the value + // externally for the moment. + unsigned ExistingPrefixSize = Code.size(); + if (TargetPrefixMax <= ExistingPrefixSize) + return 0; + return TargetPrefixMax - ExistingPrefixSize; + }(); + const unsigned PrefixBytesToAdd = + std::min(MaxPossiblePad, RemainingPrefixSize); + if (PrefixBytesToAdd == 0) + return false; + + const uint8_t Prefix = determinePaddingPrefix(RF.getInst()); + + SmallString<256> Code; + Code.append(PrefixBytesToAdd, Prefix); + Code.append(RF.getContents().begin(), RF.getContents().end()); + RF.getContents() = Code; + + // Adjust the fixups for the change in offsets + for (auto &F : RF.getFixups()) { + F.setOffset(F.getOffset() + PrefixBytesToAdd); + } + + RemainingSize -= PrefixBytesToAdd; + return true; +} + +bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + if (isFullyRelaxed(RF)) + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. + return false; + + MCInst Relaxed = RF.getInst(); + relaxInstruction(Relaxed, *RF.getSubtargetInfo()); + + SmallVector<MCFixup, 4> Fixups; + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo()); + const unsigned OldSize = RF.getContents().size(); + const unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > RemainingSize) + return false; + RF.setInst(Relaxed); + RF.getContents() = Code; + RF.getFixups() = Fixups; + RemainingSize -= Delta; + return true; +} + +bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + bool Changed = false; + if (RemainingSize != 0) + Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize); + if (RemainingSize != 0) + Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize); + return Changed; +} + +void X86AsmBackend::finishLayout(MCAssembler const &Asm, + MCAsmLayout &Layout) const { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Modern X86-64 can easily end up + // decode limited. It is often better to reduce the number of instructions + // (i.e. eliminate nops) even at the cost of increasing the size and + // complexity of others. 
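Aside (annotation, not part of the patch): a concrete picture of the trade described in the comment above. Suppose a two-byte backward jump is followed by an align directive that would otherwise need three bytes of nops; relaxing the jump to its five-byte rel32 form absorbs the padding, so the aligned fragment starts at the same offset with one fewer instruction to decode. The encodings are ordinary x86-64 bytes, written out as a compilable illustration:

    #include <cstdint>

    // Before finishLayout: short jmp (2 bytes) plus a 3-byte nop emitted for
    // the align directive.
    static const uint8_t Before[] = {0xeb, 0xf3,              // jmp -13 (rel8)
                                     0x0f, 0x1f, 0x00};       // nopl (%rax)

    // After finishLayout: the jump is relaxed to its rel32 form and the nop
    // vanishes; the next fragment still begins at the same aligned offset.
    static const uint8_t After[] = {0xe9, 0xf0, 0xff, 0xff, 0xff}; // jmp -16 (rel32)

    static_assert(sizeof(Before) == sizeof(After),
                  "same layout footprint, one fewer instruction to decode");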
+ if (!X86PadForAlign && !X86PadForBranchAlign) + return; + + DenseSet<MCFragment *> LabeledFragments; + for (const MCSymbol &S : Asm.symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (MCSection &Sec : Asm) { + if (!Sec.getKind().isText()) + continue; + + SmallVector<MCRelaxableFragment *, 4> Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data || + F.getKind() == MCFragment::FT_CompactEncodedInst) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast<MCRelaxableFragment>(*I); + Relaxable.push_back(&RF); + continue; + } + + auto canHandle = [](MCFragment &F) -> bool { + switch (F.getKind()) { + default: + return false; + case MCFragment::FT_Align: + return X86PadForAlign; + case MCFragment::FT_BoundaryAlign: + return X86PadForBranchAlign; + } + }; + // For any unhandled kind, assume we can't change layout. + if (!canHandle(F)) { + Relaxable.clear(); + continue; + } + +#ifndef NDEBUG + const uint64_t OrigOffset = Layout.getFragmentOffset(&F); +#endif + const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F); + + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. + MCFragment *FirstChangedFragment = nullptr; + unsigned RemainingSize = OrigSize; + while (!Relaxable.empty() && RemainingSize != 0) { + auto &RF = *Relaxable.pop_back_val(); + // Give the backend a chance to play any tricks it wishes to increase + // the encoding size of the given instruction. Target independent code + // will try further relaxation, but targets may play further tricks. + if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize)) + FirstChangedFragment = &RF; + + // If we have an instruction which hasn't been fully relaxed, we can't + // skip past it and insert bytes before it. Changing its starting + // offset might require a larger negative offset than it can encode. + // We don't need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + if (!isFullyRelaxed(RF)) + break; + } + Relaxable.clear(); + + if (FirstChangedFragment) { + // Make sure the offsets for any fragments in the affected range get + // updated. Note that this (conservatively) invalidates the offsets of + // those following, but this is not required. + Layout.invalidateFragmentsFrom(FirstChangedFragment); + } + + // BoundaryAlign explicitly tracks its size (unlike align). + if (F.getKind() == MCFragment::FT_BoundaryAlign) + cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize); + +#ifndef NDEBUG + const uint64_t FinalOffset = Layout.getFragmentOffset(&F); + const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + assert(FinalSize == RemainingSize && "inconsistent size computation?"); +#endif + + // If we're looking at a boundary align, make sure we don't try to pad + // its target instructions for some following directive. Doing so would + // break the alignment of the current boundary align.
+ if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) { + const MCFragment *LastFragment = BF->getLastFragment(); + if (!LastFragment) + continue; + while (&*I != LastFragment) + ++I; + } + } + } + + // The layout is done. Mark every fragment as valid. + for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) { + MCSection &Section = *Layout.getSectionOrder()[i]; + Layout.getFragmentOffset(&*Section.getFragmentList().rbegin()); + Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin()); + } } /// Write a sequence of optimal nops to the output, covering \p Count @@ -661,7 +1096,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // This CPU doesn't support long nops. If needed add more. // FIXME: We could generated something better than plain 0x90. - if (!STI.getFeatureBits()[X86::FeatureNOPL]) { + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) { for (uint64_t i = 0; i < Count; ++i) OS << '\x90'; return true; @@ -670,7 +1105,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // 15-bytes is the longest single NOP instruction, but 10-bytes is // commonly the longest that can be efficiently decoded. uint64_t MaxNopLength = 10; - if (STI.getFeatureBits()[X86::ProcIntelSLM]) + if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) MaxNopLength = 7; else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) MaxNopLength = 15; @@ -811,6 +1246,7 @@ class DarwinX86AsmBackend : public X86AsmBackend { enum { CU_NUM_SAVED_REGS = 6 }; mutable unsigned SavedRegs[CU_NUM_SAVED_REGS]; + Triple TT; bool Is64Bit; unsigned OffsetSize; ///< Offset of a "push" instruction. @@ -838,10 +1274,140 @@ protected: return 1; } +private: + /// Get the compact unwind number for a given register. The number + /// corresponds to the enum lists in compact_unwind_encoding.h. + int getCompactUnwindRegNum(unsigned Reg) const { + static const MCPhysReg CU32BitRegs[7] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const MCPhysReg CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; + } + + /// Return the registers encoded for a compact encoding with a frame + /// pointer. + uint32_t encodeCompactUnwindRegistersWithFrame() const { + // Encode the registers in the order they were saved --- 3-bits per + // register. The list of saved registers is assumed to be in reverse + // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. + uint32_t RegEnc = 0; + for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { + unsigned Reg = SavedRegs[i]; + if (Reg == 0) break; + + int CURegNum = getCompactUnwindRegNum(Reg); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for + // each register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); + } + + assert((RegEnc & 0x3FFFF) == RegEnc && + "Invalid compact register encoding!"); + return RegEnc; + } + + /// Create the permutation encoding used with frameless stacks. It is + /// passed the number of registers to be saved and an array of the registers + /// saved. + uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { + // The saved registers are numbered from 1 to 6. 
In order to encode the + // order in which they were saved, we re-number them according to their + // place in the register order. The re-numbering is relative to the last + // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in + // that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i < RegCount; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; + } + +public: + DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : X86AsmBackend(T, STI), MRI(MRI), TT(STI.getTargetTriple()), + Is64Bit(TT.isArch64Bit()) { + memset(SavedRegs, 0, sizeof(SavedRegs)); + OffsetSize = Is64Bit ? 8 : 4; + MoveInstrSize = Is64Bit ? 3 : 2; + StackDivide = Is64Bit ? 8 : 4; + } + + std::unique_ptr<MCObjectTargetWriter> + createObjectTargetWriter() const override { + uint32_t CPUType = cantFail(MachO::getCPUType(TT)); + uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TT)); + return createX86MachObjectWriter(Is64Bit, CPUType, CPUSubType); + } + /// Implementation of algorithm to generate the compact unwind encoding /// for the CFI instructions. uint32_t - generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const { + generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const override { if (Instrs.empty()) return 0; // Reset the saved registers. @@ -904,7 +1470,7 @@ protected: // L0: // .cfi_def_cfa_offset 80 // - StackSize = std::abs(Inst.getOffset()) / StackDivide; + StackSize = Inst.getOffset() / StackDivide; ++NumDefCFAOffsets; break; } @@ -991,168 +1557,6 @@ protected: return CompactUnwindEncoding; } - -private: - /// Get the compact unwind number for a given register. The number - /// corresponds to the enum lists in compact_unwind_encoding.h. - int getCompactUnwindRegNum(unsigned Reg) const { - static const MCPhysReg CU32BitRegs[7] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const MCPhysReg CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const MCPhysReg *CURegs = Is64Bit ? 
CU64BitRegs : CU32BitRegs; - for (int Idx = 1; *CURegs; ++CURegs, ++Idx) - if (*CURegs == Reg) - return Idx; - - return -1; - } - - /// Return the registers encoded for a compact encoding with a frame - /// pointer. - uint32_t encodeCompactUnwindRegistersWithFrame() const { - // Encode the registers in the order they were saved --- 3-bits per - // register. The list of saved registers is assumed to be in reverse - // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. - uint32_t RegEnc = 0; - for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { - unsigned Reg = SavedRegs[i]; - if (Reg == 0) break; - - int CURegNum = getCompactUnwindRegNum(Reg); - if (CURegNum == -1) return ~0U; - - // Encode the 3-bit register number in order, skipping over 3-bits for - // each register. - RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); - } - - assert((RegEnc & 0x3FFFF) == RegEnc && - "Invalid compact register encoding!"); - return RegEnc; - } - - /// Create the permutation encoding used with frameless stacks. It is - /// passed the number of registers to be saved and an array of the registers - /// saved. - uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { - // The saved registers are numbered from 1 to 6. In order to encode the - // order in which they were saved, we re-number them according to their - // place in the register order. The re-numbering is relative to the last - // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in - // that order: - // - // Orig Re-Num - // ---- ------ - // 6 6 - // 2 2 - // 4 3 - // 5 3 - // - for (unsigned i = 0; i < RegCount; ++i) { - int CUReg = getCompactUnwindRegNum(SavedRegs[i]); - if (CUReg == -1) return ~0U; - SavedRegs[i] = CUReg; - } - - // Reverse the list. - std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); - - uint32_t RenumRegs[CU_NUM_SAVED_REGS]; - for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ - unsigned Countless = 0; - for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) - if (SavedRegs[j] < SavedRegs[i]) - ++Countless; - - RenumRegs[i] = SavedRegs[i] - Countless - 1; - } - - // Take the renumbered values and encode them into a 10-bit number. - uint32_t permutationEncoding = 0; - switch (RegCount) { - case 6: - permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] - + 6 * RenumRegs[2] + 2 * RenumRegs[3] - + RenumRegs[4]; - break; - case 5: - permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] - + 6 * RenumRegs[3] + 2 * RenumRegs[4] - + RenumRegs[5]; - break; - case 4: - permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] - + 3 * RenumRegs[4] + RenumRegs[5]; - break; - case 3: - permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] - + RenumRegs[5]; - break; - case 2: - permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; - break; - case 1: - permutationEncoding |= RenumRegs[5]; - break; - } - - assert((permutationEncoding & 0x3FF) == permutationEncoding && - "Invalid compact register encoding!"); - return permutationEncoding; - } - -public: - DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, bool Is64Bit) - : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) { - memset(SavedRegs, 0, sizeof(SavedRegs)); - OffsetSize = Is64Bit ? 8 : 4; - MoveInstrSize = Is64Bit ? 3 : 2; - StackDivide = Is64Bit ? 
8 : 4; - } -}; - -class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { -public: - DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : DarwinX86AsmBackend(T, MRI, STI, false) {} - - std::unique_ptr<MCObjectTargetWriter> - createObjectTargetWriter() const override { - return createX86MachObjectWriter(/*Is64Bit=*/false, - MachO::CPU_TYPE_I386, - MachO::CPU_SUBTYPE_I386_ALL); - } - - /// Generate the compact unwind encoding for the CFI instructions. - uint32_t generateCompactUnwindEncoding( - ArrayRef<MCCFIInstruction> Instrs) const override { - return generateCompactUnwindEncodingImpl(Instrs); - } -}; - -class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { - const MachO::CPUSubTypeX86 Subtype; -public: - DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st) - : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {} - - std::unique_ptr<MCObjectTargetWriter> - createObjectTargetWriter() const override { - return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64, - Subtype); - } - - /// Generate the compact unwind encoding for the CFI instructions. - uint32_t generateCompactUnwindEncoding( - ArrayRef<MCCFIInstruction> Instrs) const override { - return generateCompactUnwindEncodingImpl(Instrs); - } }; } // end anonymous namespace @@ -1163,7 +1567,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); if (TheTriple.isOSBinFormatMachO()) - return new DarwinX86_32AsmBackend(T, MRI, STI); + return new DarwinX86AsmBackend(T, MRI, STI); if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF()) return new WindowsX86AsmBackend(T, false, STI); @@ -1181,13 +1585,8 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); - if (TheTriple.isOSBinFormatMachO()) { - MachO::CPUSubTypeX86 CS = - StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) - .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) - .Default(MachO::CPU_SUBTYPE_X86_64_ALL); - return new DarwinX86_64AsmBackend(T, MRI, STI, CS); - } + if (TheTriple.isOSBinFormatMachO()) + return new DarwinX86AsmBackend(T, MRI, STI); if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF()) return new WindowsX86AsmBackend(T, true, STI); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index a4f8dd669e1e5..79f07d3c7792a 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -91,7 +91,7 @@ namespace X86 { COND_G = 15, LAST_VALID_COND = COND_G, - // Artificial condition codes. These are used by AnalyzeBranch + // Artificial condition codes. These are used by analyzeBranch // to indicate a block terminated with two conditional branches that together // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, // which can't be represented on x86 with a single condition. These @@ -356,6 +356,39 @@ namespace X86 { AlignBranchRet = 1U << 4, AlignBranchIndirect = 1U << 5 }; + + /// Defines the encoding values for segment override prefix. 
+ enum EncodingOfSegmentOverridePrefix : uint8_t { + CS_Encoding = 0x2E, + DS_Encoding = 0x3E, + ES_Encoding = 0x26, + FS_Encoding = 0x64, + GS_Encoding = 0x65, + SS_Encoding = 0x36 + }; + + /// Given a segment register, return the encoding of the segment override + /// prefix for it. + inline EncodingOfSegmentOverridePrefix + getSegmentOverridePrefixForReg(unsigned Reg) { + switch (Reg) { + default: + llvm_unreachable("Unknown segment register!"); + case X86::CS: + return CS_Encoding; + case X86::DS: + return DS_Encoding; + case X86::ES: + return ES_Encoding; + case X86::FS: + return FS_Encoding; + case X86::GS: + return GS_Encoding; + case X86::SS: + return SS_Encoding; + } + } + } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -581,90 +614,107 @@ namespace X86II { /// in the lower 4 bits of the opcode. AddCCFrm = 9, + /// PrefixByte - This form is used for instructions that represent a prefix + /// byte like data16 or rep. + PrefixByte = 10, + /// MRM[0-7][rm] - These forms are used to represent instructions that use /// a Mod/RM byte, and use the middle field to hold extended opcode /// information. In the intel manual these are represented as /0, /1, ... /// + // Instructions operate on a register Reg/Opcode operand not the r/m field. + MRMr0 = 21, + + /// MRMSrcMem - But force to use the SIB field. + MRMSrcMemFSIB = 22, + + /// MRMDestMem - But force to use the SIB field. + MRMDestMemFSIB = 23, + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte /// to specify a destination, which in this case is memory. /// - MRMDestMem = 32, + MRMDestMem = 24, /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte /// to specify a source, which in this case is memory. /// - MRMSrcMem = 33, + MRMSrcMem = 25, /// MRMSrcMem4VOp3 - This form is used for instructions that encode /// operand 3 with VEX.VVVV and load from memory. /// - MRMSrcMem4VOp3 = 34, + MRMSrcMem4VOp3 = 26, /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM /// byte to specify the fourth source, which in this case is memory. /// - MRMSrcMemOp4 = 35, + MRMSrcMemOp4 = 27, /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM /// byte to specify the operands and also encodes a condition code. /// - MRMSrcMemCC = 36, + MRMSrcMemCC = 28, /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. And has /// a condition code. /// - MRMXmCC = 38, + MRMXmCC = 30, /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. /// - MRMXm = 39, + MRMXm = 31, // Next, instructions that operate on a memory r/m operand... - MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3 - MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7 + MRM0m = 32, MRM1m = 33, MRM2m = 34, MRM3m = 35, // Format /0 /1 /2 /3 + MRM4m = 36, MRM5m = 37, MRM6m = 38, MRM7m = 39, // Format /4 /5 /6 /7 /// MRMDestReg - This form is used for instructions that use the Mod/RM byte /// to specify a destination, which in this case is a register. /// - MRMDestReg = 48, + MRMDestReg = 40, /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte /// to specify a source, which in this case is a register. 
/// - MRMSrcReg = 49, + MRMSrcReg = 41, /// MRMSrcReg4VOp3 - This form is used for instructions that encode /// operand 3 with VEX.VVVV and do not load from memory. /// - MRMSrcReg4VOp3 = 50, + MRMSrcReg4VOp3 = 42, /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM /// byte to specify the fourth source, which in this case is a register. /// - MRMSrcRegOp4 = 51, + MRMSrcRegOp4 = 43, /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM /// byte to specify the operands and also encodes a condition code /// - MRMSrcRegCC = 52, + MRMSrcRegCC = 44, /// MRMXCCr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. And has /// a condition code. /// - MRMXrCC = 54, + MRMXrCC = 46, /// MRMXr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. /// - MRMXr = 55, + MRMXr = 47, // Instructions that operate on a register r/m operand... - MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3 - MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7 + MRM0r = 48, MRM1r = 49, MRM2r = 50, MRM3r = 51, // Format /0 /1 /2 /3 + MRM4r = 52, MRM5r = 53, MRM6r = 54, MRM7r = 55, // Format /4 /5 /6 /7 + + // Instructions that have mod=11 and an opcode but ignore r/m. + MRM0X = 56, MRM1X = 57, MRM2X = 58, MRM3X = 59, // Format /0 /1 /2 /3 + MRM4X = 60, MRM5X = 61, MRM6X = 62, MRM7X = 63, // Format /4 /5 /6 /7 /// MRM_XX - A mod/rm byte of exactly 0xXX. MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67, @@ -900,6 +950,16 @@ namespace X86II { NOTRACK = 1ULL << NoTrackShift }; + /// \returns true if the instruction with the given opcode is a prefix. + inline bool isPrefix(uint64_t TSFlags) { + return (TSFlags & X86II::FormMask) == PrefixByte; + } + + /// \returns true if the instruction with the given opcode is a pseudo. + inline bool isPseudo(uint64_t TSFlags) { + return (TSFlags & X86II::FormMask) == Pseudo; + } + /// \returns the "base" X86 opcode for the specified machine /// instruction. inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) { @@ -1028,10 +1088,13 @@ namespace X86II { case X86II::RawFrmDst: case X86II::RawFrmDstSrc: case X86II::AddCCFrm: + case X86II::PrefixByte: return -1; case X86II::MRMDestMem: + case X86II::MRMDestMemFSIB: return 0; case X86II::MRMSrcMem: + case X86II::MRMSrcMemFSIB: // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a // mask register.
return 1 + HasVEX_4V + HasEVEX_K; @@ -1051,12 +1114,18 @@ namespace X86II { case X86II::MRMSrcRegOp4: case X86II::MRMSrcRegCC: case X86II::MRMXrCC: + case X86II::MRMr0: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: return -1; + case X86II::MRM0X: case X86II::MRM1X: + case X86II::MRM2X: case X86II::MRM3X: + case X86II::MRM4X: case X86II::MRM5X: + case X86II::MRM6X: case X86II::MRM7X: + return -1; case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index bd009da60851e..292dd17e2f51c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -317,8 +317,10 @@ static unsigned getRelocType32(MCContext &Ctx, unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); MCFixupKind Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); X86_64RelType Type = getType64(Kind, Modifier, IsPCRel); if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 73b1969b4e822..b51011e2c52fc 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -15,7 +15,7 @@ #include "X86ATTInstPrinter.h" #include "X86BaseInfo.h" #include "X86MCTargetDesc.h" -#include "Utils/X86ShuffleDecode.h" +#include "X86ShuffleDecode.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/raw_ostream.h" @@ -199,6 +199,40 @@ using namespace llvm; CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \ CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int) +#define CASE_FMA4(Inst, suf) \ + CASE_AVX_INS_COMMON(Inst, 4, suf) \ + CASE_AVX_INS_COMMON(Inst, 4Y, suf) + +#define CASE_FMA4_PACKED_RR(Inst) \ + CASE_FMA4(Inst##PD, rr) \ + CASE_FMA4(Inst##PS, rr) + +#define CASE_FMA4_PACKED_RM(Inst) \ + CASE_FMA4(Inst##PD, rm) \ + CASE_FMA4(Inst##PS, rm) + +#define CASE_FMA4_PACKED_MR(Inst) \ + CASE_FMA4(Inst##PD, mr) \ + CASE_FMA4(Inst##PS, mr) + +#define CASE_FMA4_SCALAR_RR(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rr) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rr) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rr_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rr_Int) + +#define CASE_FMA4_SCALAR_RM(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rm) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rm) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rm_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rm_Int) + +#define CASE_FMA4_SCALAR_MR(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , mr) \ + CASE_AVX_INS_COMMON(Inst##SS4, , mr) \ + CASE_AVX_INS_COMMON(Inst##SD4, , mr_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , mr_Int) + static unsigned getVectorRegSize(unsigned RegNo) { if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) return 512; @@ -247,14 +281,15 @@ static void printMasking(raw_ostream &OS, const MCInst *MI, OS << " {z}"; } -static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { +static bool printFMAComments(const MCInst *MI, raw_ostream &OS, 
+ const MCInstrInfo &MCII) { const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr; unsigned NumOperands = MI->getNumOperands(); bool RegForm = false; bool Negate = false; StringRef AccStr = "+"; - // The operands for FMA instructions without rounding fall into two forms. + // The operands for FMA3 instructions without rounding fall into two forms: // dest, src1, src2, src3 // dest, src1, mask, src2, src3 // Where src3 is either a register or 5 memory address operands. So to find @@ -262,9 +297,112 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { // index from the end by taking into account memory vs register form when // finding src2. + // The operands for FMA4 instructions: + // dest, src1, src2, src3 + // Where src2 OR src3 are either a register or 5 memory address operands. So + // to find dest and src1 we can index from the front, src2 (reg/mem) follows + // and then src3 (reg) will be at the end. + switch (MI->getOpcode()) { default: return false; + + CASE_FMA4_PACKED_RR(FMADD) + CASE_FMA4_SCALAR_RR(FMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMADD) + CASE_FMA4_SCALAR_RM(FMADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + break; + CASE_FMA4_PACKED_MR(FMADD) + CASE_FMA4_SCALAR_MR(FMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + break; + + CASE_FMA4_PACKED_RR(FMSUB) + CASE_FMA4_SCALAR_RR(FMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMSUB) + CASE_FMA4_SCALAR_RM(FMSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + CASE_FMA4_PACKED_MR(FMSUB) + CASE_FMA4_SCALAR_MR(FMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + + CASE_FMA4_PACKED_RR(FNMADD) + CASE_FMA4_SCALAR_RR(FNMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FNMADD) + CASE_FMA4_SCALAR_RM(FNMADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + CASE_FMA4_PACKED_MR(FNMADD) + CASE_FMA4_SCALAR_MR(FNMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + + CASE_FMA4_PACKED_RR(FNMSUB) + CASE_FMA4_SCALAR_RR(FNMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FNMSUB) + CASE_FMA4_SCALAR_RM(FNMSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + CASE_FMA4_PACKED_MR(FNMSUB) + CASE_FMA4_SCALAR_MR(FNMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + + CASE_FMA4_PACKED_RR(FMADDSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMADDSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + CASE_FMA4_PACKED_MR(FMADDSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); 
+ Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + + CASE_FMA4_PACKED_RR(FMSUBADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMSUBADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + CASE_FMA4_PACKED_MR(FMSUBADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + CASE_FMA_PACKED_REG(FMADD132) CASE_FMA_SCALAR_REG(FMADD132) Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); @@ -476,8 +614,9 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { if (!Mul2Name) Mul2Name = "mem"; if (!AccName) AccName = "mem"; - OS << DestName << " = "; - // TODO: Print masking information? + OS << DestName; + printMasking(OS, MI, MCII); + OS << " = "; if (Negate) OS << '-'; @@ -504,7 +643,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, unsigned NumOperands = MI->getNumOperands(); bool RegForm = false; - if (printFMA3Comments(MI, OS)) + if (printFMAComments(MI, OS, MCII)) return true; switch (MI->getOpcode()) { @@ -669,14 +808,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSLLDQri: case X86::VPSLLDQri: case X86::VPSLLDQYri: - case X86::VPSLLDQZ128rr: - case X86::VPSLLDQZ256rr: - case X86::VPSLLDQZrr: + case X86::VPSLLDQZ128ri: + case X86::VPSLLDQZ256ri: + case X86::VPSLLDQZri: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; - case X86::VPSLLDQZ128rm: - case X86::VPSLLDQZ256rm: - case X86::VPSLLDQZrm: + case X86::VPSLLDQZ128mi: + case X86::VPSLLDQZ256mi: + case X86::VPSLLDQZmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0), @@ -687,14 +826,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSRLDQri: case X86::VPSRLDQri: case X86::VPSRLDQYri: - case X86::VPSRLDQZ128rr: - case X86::VPSRLDQZ256rr: - case X86::VPSRLDQZrr: + case X86::VPSRLDQZ128ri: + case X86::VPSRLDQZ256ri: + case X86::VPSRLDQZri: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; - case X86::VPSRLDQZ128rm: - case X86::VPSRLDQZ256rm: - case X86::VPSRLDQZrm: + case X86::VPSRLDQZ128mi: + case X86::VPSRLDQZ256mi: + case X86::VPSRLDQZmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0), @@ -1178,28 +1317,28 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSubVectorBroadcast(16, 8, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rr) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rm) DecodeSubVectorBroadcast(4, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rr) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rr) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) - 
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rm) DecodeSubVectorBroadcast(8, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rr) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rr) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rm) DecodeSubVectorBroadcast(16, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index a215550769765..33d70fdb12142 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -13,6 +13,7 @@ #include "X86InstPrinterCommon.h" #include "X86BaseInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" @@ -287,16 +288,23 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op, } } -/// printPCRelImm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value (e.g. for jumps and calls). In -/// Intel-style these print slightly differently than normal immediates. -/// for example, a $ is not emitted. -void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +/// value (e.g. for jumps and calls). In Intel-style these print slightly +/// differently than normal immediates. For example, a $ is not emitted. +/// +/// \p Address The address of the next instruction. +/// \see MCInstPrinter::printInst +void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address, + unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) - O << formatImm(Op.getImm()); - else { + if (Op.isImm()) { + if (PrintBranchImmAsAddress) { + uint64_t Target = Address + Op.getImm(); + if (MAI.getCodePointerSize() == 4) + Target &= 0xffffffff; + O << formatHex(Target); + } else + O << formatImm(Op.getImm()); + } else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print // that address in hex. 
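// A sketch of the target computation introduced above (sketchPCRelTarget
// is a hypothetical name). Per the new doc comment, Address is the
// address of the next instruction -- x86 PC-relative immediates are
// end-relative -- so no instruction-size term appears here:
static uint64_t sketchPCRelTarget(uint64_t Address, int64_t Imm,
                                  unsigned CodePointerSize) {
  uint64_t Target = Address + Imm;
  if (CodePointerSize == 4)
    Target &= 0xffffffff; // wrap within a 32-bit address space
  return Target;
}
// e.g. Address 0x1000, Imm -8 prints 0xff8 in both modes; Address 0x10,
// Imm -0x20 wraps to 0xfffffff0 when the code pointer size is 4.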
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 8e28f24b619a9..bb12ede3b7292 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -29,7 +29,9 @@ public: void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS); void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O); - void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo, + raw_ostream &O); + protected: void printInstFlags(const MCInst *MI, raw_ostream &O); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index f4bb0fbf62cd7..d1eb4d09851dd 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -45,8 +45,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (MI->getOpcode() == X86::DATA16_PREFIX && STI.getFeatureBits()[X86::Mode16Bit]) { OS << "\tdata32"; - } else if (!printAliasInstr(MI, OS) && - !printVecCompareInstr(MI, OS)) + } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); // Next always print the annotation. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h index b409b20cbea88..82baf611df038 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -31,9 +31,10 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, + raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); @@ -47,14 +48,6 @@ public: void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "byte ptr "; printMemReference(MI, OpNo, O); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index d986c829d98eb..c294da6baffa7 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -71,8 +71,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { // (actually, must, since otherwise the non-extern relocations we produce // overwhelm ld64's tiny little mind and it fails). 
DwarfFDESymbolsUseAbsDiff = true; - - UseIntegratedAssembler = true; } X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) @@ -102,10 +100,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - - // Always enable the integrated assembler by default. - // Clang also enabled it when the OS is Solaris but that is redundant here. - UseIntegratedAssembler = true; } const MCExpr * @@ -141,8 +135,16 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { TextAlignFillValue = 0x90; AllowAtInName = true; +} - UseIntegratedAssembler = true; +void X86MCAsmInfoMicrosoftMASM::anchor() { } + +X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple) + : X86MCAsmInfoMicrosoft(Triple) { + DollarIsPC = true; + SeparatorString = "\n"; + CommentString = ";"; + AllowSymbolAtNameStart = true; } void X86MCAsmInfoGNUCOFF::anchor() { } @@ -164,6 +166,4 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { TextAlignFillValue = 0x90; AllowAtInName = true; - - UseIntegratedAssembler = true; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index b2369647a40f2..ce8e84fb96b9b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -49,6 +48,13 @@ public: explicit X86MCAsmInfoMicrosoft(const Triple &Triple); }; +class X86MCAsmInfoMicrosoftMASM : public X86MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoftMASM(const Triple &Triple); +}; + class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { void anchor() override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 54a293702bd0f..7dea0760a8310 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -55,83 +55,64 @@ public: const MCSubtargetInfo &STI) const override; private: - unsigned getX86RegNum(const MCOperand &MO) const { - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; - } + unsigned getX86RegNum(const MCOperand &MO) const; - unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const { - return Ctx.getRegisterInfo()->getEncodingValue( - MI.getOperand(OpNum).getReg()); - } + unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const; /// \param MI a single low-level machine instruction. /// \param OpNum the operand #. /// \returns true if the OpNumth operand of MI require a bit to be set in /// REX prefix. - bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const { - return (getX86RegEncoding(MI, OpNum) >> 3) & 1; - } - - void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const { - OS << (char)C; - ++CurByte; - } - - void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, - raw_ostream &OS) const { - // Output the constant in little endian byte order. 
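// The inline helper being removed here (and re-added below as a static
// function) writes integers least-significant byte first. A sketch of the
// same loop, under a hypothetical name:
static void sketchEmitLE(uint64_t Val, unsigned Size, llvm::raw_ostream &OS) {
  for (unsigned I = 0; I != Size; ++I) {
    OS << static_cast<char>(Val & 0xff); // low byte first
    Val >>= 8;
  }
}
// e.g. sketchEmitLE(0x12345678, 4, OS) emits the bytes 78 56 34 12.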
- for (unsigned i = 0; i != Size; ++i) { - emitByte(Val & 255, CurByte, OS); - Val >>= 8; - } - } + bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const; void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize, - MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + MCFixupKind FixupKind, uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const; - static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { - assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); - return RM | (RegOpcode << 3) | (Mod << 6); - } - void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, - unsigned &CurByte, raw_ostream &OS) const { - emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS); - } + raw_ostream &OS) const; void emitSIBByte(unsigned SS, unsigned Index, unsigned Base, - unsigned &CurByte, raw_ostream &OS) const { - // SIB byte is in the same format as the modRMByte. - emitByte(modRMByte(SS, Index, Base), CurByte, OS); - } + raw_ostream &OS) const; void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, - uint64_t TSFlags, bool Rex, unsigned &CurByte, + uint64_t TSFlags, bool HasREX, uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI, + bool ForceSIB = false) const; - void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte, - bool &Rex, const MCInst &MI, const MCInstrDesc &Desc, - const MCSubtargetInfo &STI, raw_ostream &OS) const; + bool emitPrefixImpl(unsigned &CurOp, const MCInst &MI, + const MCSubtargetInfo &STI, raw_ostream &OS) const; - void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, - const MCInst &MI, const MCInstrDesc &Desc, + void emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const; - void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, - const MCInst &MI, raw_ostream &OS) const; + void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI, + raw_ostream &OS) const; - bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, - const MCInst &MI, const MCInstrDesc &Desc, + bool emitOpcodePrefix(int MemOperand, const MCInst &MI, const MCSubtargetInfo &STI, raw_ostream &OS) const; - uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand, - const MCInstrDesc &Desc) const; + bool emitREXPrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const; }; } // end anonymous namespace +static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); +} + +static void emitByte(uint8_t C, raw_ostream &OS) { OS << static_cast<char>(C); } + +static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + emitByte(Val & 255, OS); + Val >>= 8; + } +} + /// \returns true if this signed displacement fits in a 8-bit sign-extended /// field. 
static bool isDisp8(int Value) { return Value == (int8_t)Value; } @@ -275,7 +256,8 @@ static bool hasSecRelSymbolRef(const MCExpr *Expr) { static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MCII.get(Opcode); - if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) || + if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4 && + Opcode != X86::JCC_4) || getImmFixupKind(Desc.TSFlags) != FK_PCRel_4) return false; @@ -288,9 +270,27 @@ static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) { return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None; } +unsigned X86MCCodeEmitter::getX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; +} + +unsigned X86MCCodeEmitter::getX86RegEncoding(const MCInst &MI, + unsigned OpNum) const { + return Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(OpNum).getReg()); +} + +/// \param MI a single low-level machine instruction. +/// \param OpNum the operand #. +/// \returns true if the OpNumth operand of MI require a bit to be set in +/// REX prefix. +bool X86MCCodeEmitter::isREXExtendedReg(const MCInst &MI, + unsigned OpNum) const { + return (getX86RegEncoding(MI, OpNum) >> 3) & 1; +} + void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, MCFixupKind FixupKind, - unsigned &CurByte, raw_ostream &OS, + uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { const MCExpr *Expr = nullptr; @@ -299,7 +299,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, // relocation, emit it now. if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 && FixupKind != FK_PCRel_4) { - emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS); + emitConstant(DispOp.getImm() + ImmOffset, Size, OS); return; } Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); @@ -322,7 +322,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, } if (Kind == GOT_Normal) - ImmOffset = CurByte; + ImmOffset = static_cast<int>(OS.tell() - StartByte); } else if (Expr->getKind() == MCExpr::SymbolRef) { if (hasSecRelSymbolRef(Expr)) { FixupKind = MCFixupKind(FK_SecRel_4); @@ -361,16 +361,30 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, Ctx); // Emit a symbolic constant as a fixup and 4 zeros. - Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); - emitConstant(0, Size, CurByte, OS); + Fixups.push_back(MCFixup::create(static_cast<uint32_t>(OS.tell() - StartByte), + Expr, FixupKind, Loc)); + emitConstant(0, Size, OS); +} + +void X86MCCodeEmitter::emitRegModRMByte(const MCOperand &ModRMReg, + unsigned RegOpcodeFld, + raw_ostream &OS) const { + emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), OS); +} + +void X86MCCodeEmitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base, + raw_ostream &OS) const { + // SIB byte is in the same format as the modRMByte. 
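// Two details worth noting in the refactor above. First, ModRM and SIB
// share one bit layout, so a single packing helper serves both:
//   byte = rm/base | (regop/index << 3) | (mod/ss << 6)
// e.g. modRMByte(3, 2, 1) == 0xD1, a register-direct form. Second, with
// the threaded CurByte counter gone, fixup offsets are recovered from the
// stream position instead (hypothetical helper name):
static uint32_t sketchFixupOffset(const llvm::raw_ostream &OS,
                                  uint64_t StartByte) {
  return static_cast<uint32_t>(OS.tell() - StartByte);
}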
+ emitByte(modRMByte(SS, Index, Base), OS); } void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, - uint64_t TSFlags, bool Rex, - unsigned &CurByte, raw_ostream &OS, + uint64_t TSFlags, bool HasREX, + uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + const MCSubtargetInfo &STI, + bool ForceSIB) const { const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt); @@ -383,8 +397,9 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode assert(STI.hasFeature(X86::Mode64Bit) && "Rip-relative addressing requires 64-bit mode"); - assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); - emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); + assert(IndexReg.getReg() == 0 && !ForceSIB && + "Invalid rip-relative address"); + emitByte(modRMByte(0, RegOpcodeField, 5), OS); unsigned Opcode = MI.getOpcode(); // movq loads are handled with a special relocation form which allows the @@ -395,7 +410,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - assert(Rex); + assert(HasREX); return X86::reloc_riprel_4byte_movq_load; case X86::CALL64m: case X86::JMP64m: @@ -409,8 +424,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, case X86::SBB64rm: case X86::SUB64rm: case X86::XOR64rm: - return Rex ? X86::reloc_riprel_4byte_relax_rex - : X86::reloc_riprel_4byte_relax; + return HasREX ? X86::reloc_riprel_4byte_relax_rex + : X86::reloc_riprel_4byte_relax; } }(); @@ -425,7 +440,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, ? X86II::getSizeOfImm(TSFlags) : 0; - emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS, Fixups, -ImmSize); return; } @@ -472,23 +487,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (Disp.isImm() && isDisp8(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. - emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, RMfield), OS); return; } // Use the [REG]+disp8 form, including for [BP] which cannot be encoded. - emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, RMfield), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); return; } // This is the [REG]+disp16 case. - emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, RMfield), OS); } else { // There is no BaseReg; this is the plain [disp16] case. - emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 6), OS); } // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases. - emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); + emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups); return; } @@ -498,7 +513,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // 2-7) and absolute references. if ( // The SIB byte must be used if there is an index register. 
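// RIP-relative operands always encode as ModRM.mod == 0b00 with
// ModRM.rm == 0b101 plus a 4-byte displacement; what varies is only the
// relocation that displacement gets. A condensed sketch of the selection
// above, showing a subset of the listed opcodes (hypothetical name):
static unsigned sketchRipRelReloc(unsigned Opcode, bool HasREX) {
  switch (Opcode) {
  default:
    return X86::reloc_riprel_4byte;
  case X86::MOV64rm: // special form that lets the linker rewrite the load;
                     // REX is mandatory here, hence the assert above
    return X86::reloc_riprel_4byte_movq_load;
  case X86::ADD64rm: // one of the relaxable load/RMW forms listed above
    return HasREX ? X86::reloc_riprel_4byte_relax_rex
                  : X86::reloc_riprel_4byte_relax;
  }
}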
- IndexReg.getReg() == 0 && + !ForceSIB && IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is // present. @@ -508,8 +523,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) { if (BaseReg == 0) { // [disp32] in X86-32 mode - emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + emitByte(modRMByte(0, RegOpcodeField, 5), OS); + emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, StartByte, OS, Fixups); return; } @@ -519,7 +534,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // by emitting a displacement of 0 below. if (BaseRegNo != N86::EBP) { if (Disp.isImm() && Disp.getImm() == 0) { - emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } @@ -530,7 +545,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is exclusively used by call *a@tlscall(base). The relocation // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning. Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc())); - emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } } @@ -539,27 +554,27 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (Disp.isImm()) { if (!HasEVEX && isDisp8(Disp.getImm())) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); return; } // Try EVEX compressed 8-bit displacement first; if failed, fall back to // 32-bit displacement. int CDisp8 = 0; if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, CDisp8 - Disp.getImm()); return; } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] - emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS); unsigned Opcode = MI.getOpcode(); unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax : X86::reloc_signed_4byte; - emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS, Fixups); return; } @@ -575,30 +590,30 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. - emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), OS); ForceDisp32 = true; } else if (!Disp.isImm()) { // Emit the normal disp32 encoding. 
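// The EVEX path above stores a *compressed* 8-bit displacement: the byte
// displacement divided by a memory element width N (the real isCDisp8
// derives N from TSFlags). A sketch of the check under that assumption;
// it also explains the ImmOffset = CDisp8 - Disp argument, which makes
// emitImmediate write Disp + ImmOffset == CDisp8 into the byte:
static bool sketchIsCDisp8(int64_t Disp, int64_t N, int &CDisp8) {
  if (N == 0 || Disp % N != 0)
    return false;
  int64_t Scaled = Disp / N;
  if (Scaled != (int8_t)Scaled)
    return false; // does not fit even after compression
  CDisp8 = (int)Scaled;
  return true;
}
// e.g. Disp 256 with N == 64 compresses to CDisp8 == 4.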
- emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), OS); ForceDisp32 = true; } else if (Disp.getImm() == 0 && // Base reg can't be anything that ends up with '5' as the base // reg, it is the magic [*] nomenclature that indicates no base. BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte - emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), OS); } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. - emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), OS); } // Calculate what the SS field value should be... @@ -613,77 +628,78 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, IndexRegNo = getX86RegNum(IndexReg); else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + emitSIBByte(SS, IndexRegNo, 5, OS); } else { unsigned IndexRegNo; if (IndexReg.getReg()) IndexRegNo = getX86RegNum(IndexReg); else IndexRegNo = 4; // For example [ESP+1*<noreg>+4] - emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS); + emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS); } // Do we need to output a displacement? if (ForceDisp8) - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), - CurByte, OS, Fixups); + StartByte, OS, Fixups); } -void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, - unsigned &CurByte, bool &Rex, - const MCInst &MI, const MCInstrDesc &Desc, - const MCSubtargetInfo &STI, - raw_ostream &OS) const { +/// Emit all instruction prefixes. +/// +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, + const MCSubtargetInfo &STI, + raw_ostream &OS) const { + uint64_t TSFlags = MCII.get(MI.getOpcode()).TSFlags; // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); - if (MemoryOperand != -1) - MemoryOperand += CurOp; - // Emit segment override opcode prefix as needed. - if (MemoryOperand >= 0) - emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI, - OS); + if (MemoryOperand != -1) { + MemoryOperand += CurOp; + emitSegmentOverridePrefix(MemoryOperand + X86::AddrSegmentReg, MI, OS); + } // Emit the repeat opcode prefix as needed. unsigned Flags = MI.getFlags(); if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT) - emitByte(0xF3, CurByte, OS); + emitByte(0xF3, OS); if (Flags & X86::IP_HAS_REPEAT_NE) - emitByte(0xF2, CurByte, OS); + emitByte(0xF2, OS); // Emit the address size opcode prefix as needed. 
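// The SIB byte emitted above reuses the ModRM packing, with SS holding
// log2 of the scale amount, index 4 meaning "no index", and base 5 under
// mod 0 meaning "no base, disp32 only". A sketch (hypothetical helper,
// assuming llvm/Support/MathExtras.h):
static uint8_t sketchSIB(unsigned ScaleAmt, unsigned IndexRegNo,
                         unsigned BaseRegNo) {
  unsigned SS = llvm::Log2_32(ScaleAmt); // 1,2,4,8 -> 0,1,2,3
  return BaseRegNo | (IndexRegNo << 3) | (SS << 6);
}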
- bool need_address_override; + bool NeedAddressOverride; uint64_t AdSize = TSFlags & X86II::AdSizeMask; if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) || (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) || (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) { - need_address_override = true; + NeedAddressOverride = true; } else if (MemoryOperand < 0) { - need_address_override = false; + NeedAddressOverride = false; } else if (STI.hasFeature(X86::Mode64Bit)) { assert(!is16BitMemOperand(MI, MemoryOperand, STI)); - need_address_override = is32BitMemOperand(MI, MemoryOperand); + NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand); } else if (STI.hasFeature(X86::Mode32Bit)) { assert(!is64BitMemOperand(MI, MemoryOperand)); - need_address_override = is16BitMemOperand(MI, MemoryOperand, STI); + NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI); } else { assert(STI.hasFeature(X86::Mode16Bit)); assert(!is64BitMemOperand(MI, MemoryOperand)); - need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI); + NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI); } - if (need_address_override) - emitByte(0x67, CurByte, OS); + if (NeedAddressOverride) + emitByte(0x67, OS); // Encoding type for this instruction. uint64_t Encoding = TSFlags & X86II::EncodingMask; - if (Encoding == 0) - Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); + bool HasREX = false; + if (Encoding) + emitVEXOpcodePrefix(MemoryOperand, MI, OS); else - emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { @@ -697,11 +713,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, "SI and DI register sizes do not match"); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(2).getReg() != X86::DS) - emitSegmentOverridePrefix(CurByte, 2, MI, OS); + emitSegmentOverridePrefix(2, MI, OS); // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); CurOp += 3; // Consume operands. break; } @@ -709,11 +725,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned siReg = MI.getOperand(0).getReg(); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(1).getReg() != X86::DS) - emitSegmentOverridePrefix(CurByte, 1, MI, OS); + emitSegmentOverridePrefix(1, MI, OS); // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); CurOp += 2; // Consume operands. break; } @@ -722,24 +738,26 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); ++CurOp; // Consume operand. break; } case X86II::RawFrmMemOffs: { // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(CurByte, 1, MI, OS); + emitSegmentOverridePrefix(1, MI, OS); break; } } + + return HasREX; } -/// emitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix -/// called VEX. 
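// A condensed restatement of the address-size decision above, applying
// after the explicit AdSize16/32 TSFlags overrides have been handled: the
// 0x67 prefix is needed exactly when a memory operand does not use the
// mode's native address width (hypothetical helper name):
static bool sketchNeedsAdSizeOverride(bool In64Bit, bool In32Bit,
                                      bool Is32BitMem, bool Is16BitMem) {
  if (In64Bit)
    return Is32BitMem; // 64-bit code, 32-bit address
  if (In32Bit)
    return Is16BitMem; // 32-bit code, 16-bit address
  return !Is16BitMem;  // 16-bit code, anything wider
}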
-void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, - int MemOperand, const MCInst &MI, - const MCInstrDesc &Desc, +/// AVX instructions are encoded using a opcode prefix called VEX. +void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); uint64_t Encoding = TSFlags & X86II::EncodingMask; @@ -868,8 +886,11 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!"); + case X86II::MRM_C0: case X86II::RawFrm: + case X86II::PrefixByte: break; + case X86II::MRMDestMemFSIB: case X86II::MRMDestMem: { // MRMDestMem instructions forms: // MemAddr, src1(ModR/M) @@ -900,6 +921,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EVEX_R2 = ~(RegEnc >> 4) & 1; break; } + case X86II::MRMSrcMemFSIB: case X86II::MRMSrcMem: { // MRMSrcMem instructions forms: // src1(ModR/M), MemAddr @@ -1081,6 +1103,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EncodeRC = true; break; } + case X86II::MRMr0: { + // MRMr0 instructions forms: + // 11:rrr:000 + // dst(ModR/M) + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + EVEX_R2 = ~(RegEnc >> 4) & 1; + break; + } case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: @@ -1127,15 +1158,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Can we use the 2 byte VEX prefix? if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { - emitByte(0xC5, CurByte, OS); - emitByte(LastByte | (VEX_R << 7), CurByte, OS); + emitByte(0xC5, OS); + emitByte(LastByte | (VEX_R << 7), OS); return; } // 3 byte VEX prefix - emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS); - emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - emitByte(LastByte | (VEX_W << 7), CurByte, OS); + emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, OS); + emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, OS); + emitByte(LastByte | (VEX_W << 7), OS); } else { assert(Encoding == X86II::EVEX && "unknown encoding!"); // EVEX opcode prefix can have 4 bytes @@ -1146,144 +1177,137 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, assert((VEX_5M & 0x3) == VEX_5M && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); - emitByte(0x62, CurByte, OS); + emitByte(0x62, OS); emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) | VEX_5M, - CurByte, OS); - emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte, OS); + emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, OS); if (EncodeRC) emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) | EVEX_aaa, - CurByte, OS); + OS); else emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) | EVEX_aaa, - CurByte, OS); + OS); } } -/// Determine if the MCInst has to be encoded with a X86-64 REX prefix which -/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use -/// of X86-64 extended registers. 
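// The 2-byte VEX form (C5) chosen above is only legal when every field it
// cannot carry holds its default value: opcode map 0F (m-mmmm == 1),
// VEX.W == 0, and the inverted X/B bits set, i.e. no extended index/base
// registers. The condition, restated as a sketch:
static bool sketchCanUseVex2(bool ForceVex3, bool IsVex, bool InvB,
                             bool InvX, bool W, unsigned MMMMM) {
  return !ForceVex3 && IsVex && InvB && InvX && !W && MMMMM == 1;
}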
-uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags, - int MemOperand, - const MCInstrDesc &Desc) const { - uint8_t REX = 0; - bool UsesHighByteReg = false; - - if (TSFlags & X86II::REX_W) - REX |= 1 << 3; // set REX.W +/// Emit REX prefix which specifies +/// 1) 64-bit instructions, +/// 2) non-default operand size, and +/// 3) use of X86-64 extended registers. +/// +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI, + raw_ostream &OS) const { + uint8_t REX = [&, MemOperand]() { + uint8_t REX = 0; + bool UsesHighByteReg = false; + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) + return REX; + + unsigned NumOps = MI.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + for (unsigned i = CurOp; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; + if (X86II::isX86_64NonExtLowByteReg(Reg)) + // FIXME: The caller of determineREXPrefix slaps this prefix onto + // anything that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + } - if (MI.getNumOperands() == 0) + switch (TSFlags & X86II::FormMask) { + case X86II::AddRegFrm: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMSrcReg: + case X86II::MRMSrcRegCC: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMSrcMem: + case X86II::MRMSrcMemCC: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + CurOp += X86::AddrNumOperands; + break; + case X86II::MRMDestReg: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMDestMem: + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + CurOp += X86::AddrNumOperands; + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMXmCC: + case X86II::MRMXm: + case X86II::MRM0m: + case X86II::MRM1m: + case X86II::MRM2m: + case X86II::MRM3m: + case X86II::MRM4m: + case X86II::MRM5m: + case X86II::MRM6m: + case X86II::MRM7m: + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + break; + case X86II::MRMXrCC: + case X86II::MRMXr: + case X86II::MRM0r: + case X86II::MRM1r: + case X86II::MRM2r: + case X86II::MRM3r: + case X86II::MRM4r: + case X86II::MRM5r: + case X86II::MRM6r: + case X86II::MRM7r: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMr0: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMDestMemFSIB: + llvm_unreachable("FSIB format never need REX prefix!"); + } + if (REX && UsesHighByteReg) + report_fatal_error( + "Cannot encode high byte register in REX-prefixed instruction"); return REX; + }(); - unsigned NumOps = MI.getNumOperands(); - unsigned CurOp = 
X86II::getOperandBias(Desc); - - // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. - for (unsigned i = CurOp; i != NumOps; ++i) { - const MCOperand &MO = MI.getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) - UsesHighByteReg = true; - if (X86II::isX86_64NonExtLowByteReg(Reg)) - // FIXME: The caller of determineREXPrefix slaps this prefix onto anything - // that returns non-zero. - REX |= 0x40; // REX fixed encoding prefix - } - - switch (TSFlags & X86II::FormMask) { - case X86II::AddRegFrm: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - case X86II::MRMSrcReg: - case X86II::MRMSrcRegCC: - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - case X86II::MRMSrcMem: - case X86II::MRMSrcMemCC: - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - CurOp += X86::AddrNumOperands; - break; - case X86II::MRMDestReg: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - break; - case X86II::MRMDestMem: - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - CurOp += X86::AddrNumOperands; - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - break; - case X86II::MRMXmCC: - case X86II::MRMXm: - case X86II::MRM0m: - case X86II::MRM1m: - case X86II::MRM2m: - case X86II::MRM3m: - case X86II::MRM4m: - case X86II::MRM5m: - case X86II::MRM6m: - case X86II::MRM7m: - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - break; - case X86II::MRMXrCC: - case X86II::MRMXr: - case X86II::MRM0r: - case X86II::MRM1r: - case X86II::MRM2r: - case X86II::MRM3r: - case X86II::MRM4r: - case X86II::MRM5r: - case X86II::MRM6r: - case X86II::MRM7r: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - } - if (REX && UsesHighByteReg) - report_fatal_error( - "Cannot encode high byte register in REX-prefixed instruction"); + if (!REX) + return false; - return REX; + emitByte(0x40 | REX, OS); + return true; } /// Emit segment override opcode prefix as needed. -void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte, - unsigned SegOperand, +void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI, raw_ostream &OS) const { // Check for explicit segment override on memory operand. - switch (MI.getOperand(SegOperand).getReg()) { - default: - llvm_unreachable("Unknown segment register!"); - case 0: - break; - case X86::CS: - emitByte(0x2E, CurByte, OS); - break; - case X86::SS: - emitByte(0x36, CurByte, OS); - break; - case X86::DS: - emitByte(0x3E, CurByte, OS); - break; - case X86::ES: - emitByte(0x26, CurByte, OS); - break; - case X86::FS: - emitByte(0x64, CurByte, OS); - break; - case X86::GS: - emitByte(0x65, CurByte, OS); - break; - } + if (unsigned Reg = MI.getOperand(SegOperand).getReg()) + emitByte(X86::getSegmentOverridePrefixForReg(Reg), OS); } /// Emit all instruction prefixes prior to the opcode. 
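// The switch removed above is the entire mapping now provided by
// X86::getSegmentOverridePrefixForReg; a sketch of it, for reference:
static uint8_t sketchSegOverridePrefix(unsigned Reg) {
  switch (Reg) {
  case X86::CS: return 0x2E;
  case X86::SS: return 0x36;
  case X86::DS: return 0x3E;
  case X86::ES: return 0x26;
  case X86::FS: return 0x64;
  case X86::GS: return 0x65;
  }
  llvm_unreachable("unknown segment register");
}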
@@ -1291,48 +1315,44 @@ void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte, /// \param MemOperand the operand # of the start of a memory operand if present. /// If not present, it is -1. /// -/// \returns true if a REX prefix was used. -bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, - int MemOperand, const MCInst &MI, - const MCInstrDesc &Desc, +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, const MCSubtargetInfo &STI, raw_ostream &OS) const { - bool Ret = false; + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + // Emit the operand size opcode prefix as needed. if ((TSFlags & X86II::OpSizeMask) == (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16)) - emitByte(0x66, CurByte, OS); + emitByte(0x66, OS); // Emit the LOCK opcode prefix. if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK) - emitByte(0xF0, CurByte, OS); + emitByte(0xF0, OS); // Emit the NOTRACK opcode prefix. if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK) - emitByte(0x3E, CurByte, OS); + emitByte(0x3E, OS); switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: // 66 - emitByte(0x66, CurByte, OS); + emitByte(0x66, OS); break; case X86II::XS: // F3 - emitByte(0xF3, CurByte, OS); + emitByte(0xF3, OS); break; case X86II::XD: // F2 - emitByte(0xF2, CurByte, OS); + emitByte(0xF2, OS); break; } // Handle REX prefix. - // FIXME: Can this come before F2 etc to simplify emission? - if (STI.hasFeature(X86::Mode64Bit)) { - if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) { - emitByte(0x40 | REX, CurByte, OS); - Ret = true; - } - } else { - assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode."); - } + assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) && + "REX.W requires 64bit mode."); + bool HasREX = STI.hasFeature(X86::Mode64Bit) + ? emitREXPrefix(MemOperand, MI, OS) + : false; // 0x0F escape code must be emitted just before the opcode. switch (TSFlags & X86II::OpMapMask) { @@ -1340,19 +1360,20 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::T8: // 0F 38 case X86II::TA: // 0F 3A case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller. - emitByte(0x0F, CurByte, OS); + emitByte(0x0F, OS); break; } switch (TSFlags & X86II::OpMapMask) { case X86II::T8: // 0F 38 - emitByte(0x38, CurByte, OS); + emitByte(0x38, OS); break; case X86II::TA: // 0F 3A - emitByte(0x3A, CurByte, OS); + emitByte(0x3A, OS); break; } - return Ret; + + return HasREX; } void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS, @@ -1362,16 +1383,12 @@ void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; // Pseudo instructions don't get encoded. - if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + if (X86II::isPseudo(TSFlags)) return; unsigned CurOp = X86II::getOperandBias(Desc); - // Keep track of the current byte being emitted. - unsigned CurByte = 0; - - bool Rex = false; - emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); + emitPrefixImpl(CurOp, MI, STI, OS); } void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -1382,17 +1399,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; // Pseudo instructions don't get encoded. 
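// Prefixes in emitOpcodePrefix above go out in a fixed order ahead of the
// opcode:
//   [66 opsize] [F0 lock] [3E notrack] [66|F3|F2 mandatory] [REX]
//   [0F escape] [38|3A map] opcode
// The operand-size test is mode-relative -- 0x66 flips between 16- and
// 32-bit operand size -- which the first branch reduces to:
static bool sketchNeedsOpSize(uint64_t TSFlags, bool In16BitMode) {
  uint64_t OpSize = TSFlags & X86II::OpSizeMask;
  return OpSize == (In16BitMode ? X86II::OpSize32 : X86II::OpSize16);
}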
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + if (X86II::isPseudo(TSFlags)) return; unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = X86II::getOperandBias(Desc); - // Keep track of the current byte being emitted. - unsigned CurByte = 0; + uint64_t StartByte = OS.tell(); - bool Rex = false; - emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); + bool HasREX = emitPrefixImpl(CurOp, MI, STI, OS); // It uses the VEX.VVVV field? bool HasVEX_4V = TSFlags & X86II::VEX_4V; @@ -1422,7 +1437,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::RawFrmDstSrc: case X86II::RawFrmSrc: case X86II::RawFrmDst: - emitByte(BaseOpcode, CurByte, OS); + case X86II::PrefixByte: + emitByte(BaseOpcode, OS); break; case X86II::AddCCFrm: { // This will be added to the opcode in the fallthrough. @@ -1431,47 +1447,47 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, --NumOps; // Drop the operand from the end. LLVM_FALLTHROUGH; case X86II::RawFrm: - emitByte(BaseOpcode + OpcodeOffset, CurByte, OS); + emitByte(BaseOpcode + OpcodeOffset, OS); if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII)) break; const MCOperand &Op = MI.getOperand(CurOp++); emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags), - MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS, + MCFixupKind(X86::reloc_branch_4byte_pcrel), StartByte, OS, Fixups); break; } case X86II::RawFrmMemOffs: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); + StartByte, OS, Fixups); ++CurOp; // skip segment operand break; case X86II::RawFrmImm8: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); - emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + StartByte, OS, Fixups); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); break; case X86II::RawFrmImm16: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); - emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + StartByte, OS, Fixups); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups); break; case X86II::AddRegFrm: - emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), OS); break; case X86II::MRMDestReg: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1481,12 +1497,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++SrcRegNum; emitRegModRMByte(MI.getOperand(CurOp), - getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + getX86RegNum(MI.getOperand(SrcRegNum)), OS); CurOp = SrcRegNum + 1; break; } + case X86II::MRMDestMemFSIB: case X86II::MRMDestMem: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + X86::AddrNumOperands; if (HasEVEX_K) // Skip writemask @@ -1495,13 +1512,14 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in 
VEX_VVVV) ++SrcRegNum; + bool ForceSIB = (Form == X86II::MRMDestMemFSIB); emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags, - Rex, CurByte, OS, Fixups, STI); + HasREX, StartByte, OS, Fixups, STI, ForceSIB); CurOp = SrcRegNum + 1; break; } case X86II::MRMSrcReg: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1511,7 +1529,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++SrcRegNum; emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); @@ -1521,17 +1539,17 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRMSrcReg4VOp3: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; ++CurOp; // Encoded in VEX.VVVV break; } case X86II::MRMSrcRegOp4: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; // Skip 1st src (which is encoded in VEX_VVVV) @@ -1542,7 +1560,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, I8RegNum = getX86RegEncoding(MI, SrcRegNum++); emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; break; } @@ -1551,12 +1569,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned SecondOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); emitRegModRMByte(MI.getOperand(SecondOp), - getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); + getX86RegNum(MI.getOperand(FirstOp)), OS); break; } + case X86II::MRMSrcMemFSIB: case X86II::MRMSrcMem: { unsigned FirstMemOp = CurOp + 1; @@ -1566,10 +1585,11 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); + bool ForceSIB = (Form == X86II::MRMSrcMemFSIB); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI, ForceSIB); CurOp = FirstMemOp + X86::AddrNumOperands; if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); @@ -1578,10 +1598,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMSrcMem4VOp3: { unsigned FirstMemOp = CurOp + 1; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; ++CurOp; // Encoded in VEX.VVVV. 
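// Condition-carrying forms (AddCCFrm and the MRM*CC cases above) fold the
// 4-bit condition code into the opcode byte itself rather than emitting a
// separate immediate. A sketch (hypothetical name):
static uint8_t sketchCCOpcode(uint8_t BaseOpcode, unsigned CC) {
  assert(CC < 16 && "x86 condition codes are 4 bits");
  return BaseOpcode + CC; // e.g. CMOVcc r32: 0F 40+cc
}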
break; @@ -1595,10 +1615,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); I8RegNum = getX86RegEncoding(MI, FirstMemOp++); - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; break; } @@ -1608,10 +1628,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); break; } @@ -1619,8 +1639,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned RegOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); - emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); + emitByte(BaseOpcode + CC, OS); + emitRegModRMByte(MI.getOperand(RegOp), 0, OS); break; } @@ -1637,10 +1657,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitRegModRMByte(MI.getOperand(CurOp++), - (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte, - OS); + (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, OS); + break; + case X86II::MRMr0: + emitByte(BaseOpcode, OS); + emitByte(modRMByte(3, getX86RegNum(MI.getOperand(CurOp++)),0), OS); break; case X86II::MRMXmCC: { @@ -1648,9 +1671,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); - emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI); + emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, HasREX, StartByte, OS, Fixups, + STI); break; } @@ -1667,13 +1691,25 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 
0 : Form - X86II::MRM0m, TSFlags, - Rex, CurByte, OS, Fixups, STI); + HasREX, StartByte, OS, Fixups, STI); CurOp += X86::AddrNumOperands; break; + case X86II::MRM0X: + case X86II::MRM1X: + case X86II::MRM2X: + case X86II::MRM3X: + case X86II::MRM4X: + case X86II::MRM5X: + case X86II::MRM6X: + case X86II::MRM7X: + emitByte(BaseOpcode, OS); + emitByte(0xC0 + ((Form - X86II::MRM0X) << 3), OS); + break; + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: @@ -1738,8 +1774,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: - emitByte(BaseOpcode, CurByte, OS); - emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); + emitByte(BaseOpcode, OS); + emitByte(0xC0 + Form - X86II::MRM_C0, OS); break; } @@ -1754,7 +1790,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, I8RegNum |= Val; } emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1, - CurByte, OS, Fixups); + StartByte, OS, Fixups); } else { // If there is a remaining operand, it must be a trailing immediate. Emit it // according to the right size for the instruction. Some instructions @@ -1762,13 +1798,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, while (CurOp != NumOps && NumOps - CurOp <= 2) { emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); + StartByte, OS, Fixups); } } if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) - emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + emitByte(X86II::getBaseOpcodeFor(TSFlags), OS); + assert(OS.tell() - StartByte <= 15 && + "The size of instruction must be no longer than 15."); #ifndef NDEBUG // FIXME: Verify. if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 049a3a8159841..81110ba666e95 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -30,10 +30,6 @@ #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" -#if _MSC_VER -#include <intrin.h> -#endif - using namespace llvm; #define GET_REGINFO_MC_DESC @@ -294,7 +290,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (!FS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); - std::string CPUName = CPU; + std::string CPUName = std::string(CPU); if (CPUName.empty()) CPUName = "generic"; @@ -335,7 +331,10 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, MAI = new X86ELFMCAsmInfo(TheTriple); } else if (TheTriple.isWindowsMSVCEnvironment() || TheTriple.isWindowsCoreCLREnvironment()) { - MAI = new X86MCAsmInfoMicrosoft(TheTriple); + if (Options.getAssemblyLanguage().equals_lower("masm")) + MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple); + else + MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { MAI = new X86MCAsmInfoGNUCOFF(TheTriple); @@ -350,7 +349,7 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, // Initial state of the frame pointer is esp+stackGrowth. unsigned StackPtr = is64Bit ? 
X86::RSP : X86::ESP; - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa( nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); MAI->addInitialFrameState(Inst); @@ -401,6 +400,9 @@ public: findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, uint64_t GotSectionVA, const Triple &TargetTriple) const override; + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override; Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst, uint64_t Addr, uint64_t Size) const override; @@ -519,6 +521,15 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries( } } +bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, uint64_t &Target) const { + if (Inst.getNumOperands() == 0 || + Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL) + return false; + Target = Addr + Size + Inst.getOperand(0).getImm(); + return true; +} + Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress( const MCInst &Inst, uint64_t Addr, uint64_t Size) const { const MCInstrDesc &MCID = Info->get(Inst.getOpcode()); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 0c789061f0e13..e8c72be1d9b6f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -13,27 +13,28 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H -#include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/DataTypes.h" +#include <memory> #include <string> namespace llvm { +class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCInst; +class MCInstPrinter; class MCInstrInfo; class MCObjectTargetWriter; class MCObjectWriter; +class MCRegister; class MCRegisterInfo; +class MCStreamer; class MCSubtargetInfo; -class MCRelocationInfo; class MCTargetOptions; +class MCTargetStreamer; class Target; class Triple; class StringRef; -class raw_ostream; -class raw_pwrite_stream; /// Flavour of dwarf regnumbers /// diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp new file mode 100644 index 0000000000000..62c1c399a606e --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp @@ -0,0 +1,571 @@ +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + // Default to copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4 + CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. + if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len, + SmallVectorImpl<int> &ShuffleMask) { + assert((Idx + Len) <= NumElts && "Insertion out of range"); + + for (unsigned i = 0; i != NumElts; ++i) + ShuffleMask.push_back(i); + for (unsigned i = 0; i != Len; ++i) + ShuffleMask[Idx + i] = NumElts + i; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(NElts + i); + + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(NElts + i); +} + +void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); + } +} + +void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { + const unsigned NumLaneElts = 2; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) + ShuffleMask.push_back(l); +} + +void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + const unsigned NumLaneElts = 16; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + const unsigned NumLaneElts = 16; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + +void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + const unsigned
NumLaneElts = 16; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Imm; + // If i+imm is out of this lane then we actually need the other source. + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + // Not all bits of the immediate are used so mask it. + assert(isPowerOf2_32(NumElts) && "NumElts should be a power of 2"); + Imm = Imm & (NumElts - 1); + for (unsigned i = 0; i != NumElts; ++i) + ShuffleMask.push_back(i + Imm); +} + +void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned Size = NumElts * ScalarBits; + unsigned NumLanes = Size / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + uint32_t SplatImm = (Imm & 0xff) * 0x01010101; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(SplatImm % NumLaneElts + l); + SplatImm /= NumLaneElts; + } + } +} + +void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } + } +} + +void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + } +} + +void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + +void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, + unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLaneElts = 128 / ScalarBits; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // Each half of a lane comes from a different source. + for (unsigned s = 0; s != NumElts * 2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts / 2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, + SmallVectorImpl<int> &ShuffleMask) { + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = (NumElts * ScalarBits) / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, + SmallVectorImpl<int> &ShuffleMask) { + // Handle 128 and 256-bit vector lengths.
AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = (NumElts * ScalarBits) / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +void DecodeVectorBroadcast(unsigned NumElts, + SmallVectorImpl<int> &ShuffleMask) { + ShuffleMask.append(NumElts, 0); +} + +void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts, + SmallVectorImpl<int> &ShuffleMask) { + unsigned Scale = DstNumElts / SrcNumElts; + + for (unsigned i = 0; i != Scale; ++i) + for (unsigned j = 0; j != SrcNumElts; ++j) + ShuffleMask.push_back(j); +} + +void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, + unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElementsInLane = 128 / ScalarSize; + unsigned NumLanes = NumElts / NumElementsInLane; + + for (unsigned l = 0; l != NumElts; l += NumElementsInLane) { + unsigned Index = (Imm % NumLanes) * NumElementsInLane; + Imm /= NumLanes; // Discard the bits we just used. + // We actually need the other source. + if (l >= (NumElts / 2)) + Index += NumElts; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(Index + i); + } +} + +void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned HalfSize = NumElts / 2; + + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfMask = Imm >> (l * 4); + unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) + ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i); + } +} + +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + // For 256/512-bit vectors the base of the shuffle is the 128-bit + // subvector we're inside. + int Base = (i / 16) * 16; + // If the high bit (7) of the byte is set, the element is zeroed. + if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i < NumElts; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask wraps around. + unsigned Bit = i % 8; + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElts + i : i); + } +} + +void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size"); + + // VPPERM Operation + // Bits[4:0] - Byte Index (0 - 31) + // Bits[7:5] - Permute Operation + // + // Permute Operation: + // 0 - Source byte (no logical operation). + // 1 - Invert source byte. + // 2 - Bit reverse of source byte. + // 3 - Bit reverse of inverted source byte. + // 4 - 00h (zero - fill). + // 5 - FFh (ones - fill). + // 6 - Most significant bit of source byte replicated in all bit positions. + // 7 - Invert most significant bit of source byte and replicate in all bit positions.
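+ // + // For example, a selector byte of 0x14 (PermuteOp 0, index 20) reads byte 4 + // of the second 16-byte source, while 0x94 (PermuteOp 4) always produces + // zero; the remaining operations cannot be expressed as a shuffle, so the + // loop below clears the mask and gives up.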
+ for (int i = 0, e = RawMask.size(); i < e; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + + uint64_t M = RawMask[i]; + uint64_t PermuteOp = (M >> 5) & 0x7; + if (PermuteOp == 4) { + ShuffleMask.push_back(SM_SentinelZero); + continue; + } + if (PermuteOp != 0) { + ShuffleMask.clear(); + return; + } + + uint64_t Index = M & 0x1F; + ShuffleMask.push_back((int)Index); + } +} + +void DecodeVPERMMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + for (unsigned l = 0; l != NumElts; l += 4) + for (unsigned i = 0; i != 4; ++i) + ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3)); +} + +void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, + unsigned NumDstElts, bool IsAnyExtend, + SmallVectorImpl<int> &ShuffleMask) { + unsigned Scale = DstScalarBits / SrcScalarBits; + assert(SrcScalarBits < DstScalarBits && + "Expected zero extension mask to increase scalar size"); + + int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero; + for (unsigned i = 0; i != NumDstElts; i++) { + ShuffleMask.push_back(i); + ShuffleMask.append(Scale - 1, Sentinel); + } +} + +void DecodeZeroMoveLowMask(unsigned NumElts, + SmallVectorImpl<int> &ShuffleMask) { + ShuffleMask.push_back(0); + ShuffleMask.append(NumElts - 1, SM_SentinelZero); +} + +void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, + SmallVectorImpl<int> &ShuffleMask) { + // First element comes from the first element of second source. + // Remaining elements: Load zero extends / Move copies from first source. + ShuffleMask.push_back(NumElts); + for (unsigned i = 1; i < NumElts; i++) + ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); +} + +void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + unsigned HalfElts = NumElts / 2; + + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole elements. + if (0 != (Len % EltSize) || 0 != (Idx % EltSize)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(NumElts, SM_SentinelUndef); + return; + } + + // Convert the length and index to work with elements. + Len /= EltSize; + Idx /= EltSize; + + // EXTRQ: Extract Len elements starting from Idx. Zero pad the remaining + // elements of the lower 64-bits. The upper 64-bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != (int)HalfElts; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = HalfElts; i != (int)NumElts; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + unsigned HalfElts = NumElts / 2; + + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole elements. + if (0 != (Len % EltSize) || 0 != (Idx % EltSize)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined.
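+ // (For example, with EltSize == 8, Len == 32 and Idx == 40 the bit field + // would span bits [40, 72), past bit 64, so the whole mask is undefined.)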
+ if ((Len + Idx) > 64) { + ShuffleMask.append(NumElts, SM_SentinelUndef); + return; + } + + // Convert the length and index to work with elements. + Len /= EltSize; + Idx /= EltSize; + + // INSERTQ: Extract lowest Len elements from lower half of second source and + // insert over first source starting at Idx element. The upper 64-bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + NumElts); + for (int i = Idx + Len; i != (int)HalfElts; ++i) + ShuffleMask.push_back(i); + for (int i = HalfElts; i != (int)NumElts; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, + ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + unsigned VecSize = NumElts * ScalarBits; + unsigned NumLanes = VecSize / 128; + unsigned NumEltsPerLane = NumElts / NumLanes; + assert((VecSize == 128 || VecSize == 256 || VecSize == 512) && + "Unexpected vector size"); + assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size"); + + for (unsigned i = 0, e = RawMask.size(); i < e; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t M = RawMask[i]; + M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3)); + unsigned LaneOffset = i & ~(NumEltsPerLane - 1); + ShuffleMask.push_back((int)(LaneOffset + M)); + } +} + +void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, + ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + unsigned VecSize = NumElts * ScalarBits; + unsigned NumLanes = VecSize / 128; + unsigned NumEltsPerLane = NumElts / NumLanes; + assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size"); + assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size"); + assert((NumElts == RawMask.size()) && "Unexpected mask size"); + + for (unsigned i = 0, e = RawMask.size(); i < e; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + + // VPERMIL2 Operation. + // Bits[3] - Match Bit. + // Bits[2:1] - (Per Lane) PD Shuffle Mask. + // Bits[2:0] - (Per Lane) PS Shuffle Mask. + uint64_t Selector = RawMask[i]; + unsigned MatchBit = (Selector >> 3) & 0x1; + + // M2Z[0:1] MatchBit + // 0Xb X Source selected by Selector index. + // 10b 0 Source selected by Selector index. + // 10b 1 Zero. + // 11b 0 Zero. + // 11b 1 Source selected by Selector index.
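+ // + // In other words: when M2Z[1] is set, an element is zeroed whenever its + // MatchBit differs from M2Z[0]; when M2Z[1] is clear, the MatchBit is + // ignored and the Selector index is always used.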
+ if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) { + ShuffleMask.push_back(SM_SentinelZero); + continue; + } + + int Index = i & ~(NumEltsPerLane - 1); + if (ScalarBits == 64) + Index += (Selector >> 1) & 0x1; + else + Index += Selector & 0x3; + + int Src = (Selector >> 2) & 0x1; + Index += Src * NumElts; + ShuffleMask.push_back(Index); + } +} + +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + uint64_t EltMaskSize = RawMask.size() - 1; + for (int i = 0, e = RawMask.size(); i != e; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t M = RawMask[i]; + M &= EltMaskSize; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask) { + uint64_t EltMaskSize = (RawMask.size() * 2) - 1; + for (int i = 0, e = RawMask.size(); i != e; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t M = RawMask[i]; + M &= EltMaskSize; + ShuffleMask.push_back((int)M); + } +} + +} // llvm namespace diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h new file mode 100644 index 0000000000000..4ef9959f7a278 --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h @@ -0,0 +1,166 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H +#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H + +#include <cstdint> + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class APInt; +template <typename T> class ArrayRef; +template <typename T> class SmallVectorImpl; + +enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; + +/// Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask. +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +// Insert the bottom Len elements from a second source into a vector starting at +// element Idx. +void DecodeInsertElementMask(unsigned NumElts, unsigned Idx, unsigned Len, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask. +/// i.e. <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +/// Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask. +/// i.e. 
<0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps. +void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for pshufhw. +void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for pshuflw. +void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for shufp*. +void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*. +void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*. +void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, + SmallVectorImpl<int> &ShuffleMask); + +/// Decodes a broadcast of the first element of a vector. +void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +/// Decodes a broadcast of a subvector to a larger vector type. +void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a BLEND immediate mask into a shuffle mask. +void DecodeBLENDMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a shuffle of packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, + unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// Decodes the shuffle masks for VPERMQ/VPERMPD. +void DecodeVPERMMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a VPPERM mask from a raw array of constants such as from +/// BUILD_VECTOR. +/// This can only decode basic masks (permutes + zeros), not any of the other +/// operations that VPPERM can perform. +void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a zero extension instruction as a shuffle mask.
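+/// e.g. a PMOVZXWD-style v8i16 -> v4i32 zero extension yields the mask +/// <0, Z, 1, Z, 2, Z, 3, Z>, where Z is SM_SentinelZero (or SM_SentinelUndef +/// for an any-extend).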
+void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, + unsigned NumDstElts, bool IsAnyExtend, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a move lower and zero upper instruction as a shuffle mask. +void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); + +/// Decode a scalar float move instruction as a shuffle mask. +void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a SSE4A EXTRQ instruction as a shuffle mask. +void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a SSE4A INSERTQ instruction as a shuffle mask. +void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants. +void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, + ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants. +void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, + ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); + +/// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, + SmallVectorImpl<int> &ShuffleMask); +} // llvm namespace + +#endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index db624378d517b..3bebcc24fd3a4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -28,7 +28,7 @@ public: void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; - void FinishImpl() override; + void finishImpl() override; }; void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { @@ -52,11 +52,11 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { XTS->emitFPOData(ProcSym, Loc); } -void X86WinCOFFStreamer::FinishImpl() { - EmitFrames(nullptr); +void X86WinCOFFStreamer::finishImpl() { + emitFrames(nullptr); EmitWindowsUnwindTables(); - MCWinCOFFStreamer::FinishImpl(); + MCWinCOFFStreamer::finishImpl(); } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index d5494ef12370f..11251fb2b2ba7 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -159,7 +159,7 @@ bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) { MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() { MCSymbol *Label = getContext().createTempSymbol("cfi", true); - getStreamer().EmitLabel(Label); + getStreamer().emitLabel(Label); return Label; } @@ -372,13 +372,13 @@ void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) { OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize - OS.EmitIntValue(LocalSize, 4); - OS.EmitIntValue(FPO->ParamsSize, 4); 
- OS.EmitIntValue(MaxStackSize, 4); - OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc + OS.emitInt32(LocalSize); + OS.emitInt32(FPO->ParamsSize); + OS.emitInt32(MaxStackSize); + OS.emitInt32(FrameFuncStrTabOff); // FrameFunc OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2); - OS.EmitIntValue(SavedRegSize, 2); - OS.EmitIntValue(CurFlags, 4); + OS.emitInt16(SavedRegSize); + OS.emitInt32(CurFlags); } /// Compute and emit the real CodeView FrameData subsection. @@ -398,12 +398,12 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) { MCSymbol *FrameBegin = Ctx.createTempSymbol(), *FrameEnd = Ctx.createTempSymbol(); - OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4); + OS.emitInt32(unsigned(DebugSubsectionKind::FrameData)); OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4); - OS.EmitLabel(FrameBegin); + OS.emitLabel(FrameBegin); // Start with the RVA of the function in question. - OS.EmitValue(MCSymbolRefExpr::create(FPO->Function, + OS.emitValue(MCSymbolRefExpr::create(FPO->Function, MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx), 4); @@ -437,8 +437,8 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) { FSM.emitFrameDataRecord(OS, Inst.Label); } - OS.EmitValueToAlignment(4, 0); - OS.EmitLabel(FrameEnd); + OS.emitValueToAlignment(4, 0); + OS.emitLabel(FrameEnd); return false; }