Diffstat (limited to 'llvm/lib/Target/X86/X86MCInstLower.cpp')
-rw-r--r--   llvm/lib/Target/X86/X86MCInstLower.cpp   166
1 file changed, 153 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 78098fd6262f..2fc9a2af01d7 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -569,6 +569,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
     if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
       unsigned NewOpc;
       switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
       case X86::VPCMPBZ128rmi:  NewOpc = X86::VPCMPEQBZ128rm;  break;
       case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
       case X86::VPCMPBZ128rri:  NewOpc = X86::VPCMPEQBZ128rr;  break;
@@ -640,6 +641,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
     if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
       unsigned NewOpc;
       switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
       case X86::VPCMPBZ128rmi:  NewOpc = X86::VPCMPGTBZ128rm;  break;
       case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
       case X86::VPCMPBZ128rri:  NewOpc = X86::VPCMPGTBZ128rr;  break;
@@ -876,6 +878,52 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   case X86::MOVSX64rr32:
     SimplifyMOVSX(OutMI);
     break;
+
+  case X86::VCMPPDrri:
+  case X86::VCMPPDYrri:
+  case X86::VCMPPSrri:
+  case X86::VCMPPSYrri:
+  case X86::VCMPSDrr:
+  case X86::VCMPSSrr: {
+    // Swap the operands if it will enable a 2 byte VEX encoding.
+    // FIXME: Change the immediate to improve opportunities?
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+      unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+      switch (Imm) {
+      default: break;
+      case 0x00: // EQUAL
+      case 0x03: // UNORDERED
+      case 0x04: // NOT EQUAL
+      case 0x07: // ORDERED
+        std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+        break;
+      }
+    }
+    break;
+  }
+
+  case X86::VMOVHLPSrr:
+  case X86::VUNPCKHPDrr:
+    // These are not truly commutable so hide them from the default case.
+    break;
+
+  default: {
+    // If the instruction is a commutable arithmetic instruction we might be
+    // able to commute the operands to get a 2 byte VEX prefix.
+    uint64_t TSFlags = MI->getDesc().TSFlags;
+    if (MI->getDesc().isCommutable() &&
+        (TSFlags & X86II::EncodingMask) == X86II::VEX &&
+        (TSFlags & X86II::OpMapMask) == X86II::TB &&
+        (TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
+        !(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) &&
+        OutMI.getNumOperands() == 3) {
+      if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+          X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
+        std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+    }
+    break;
+  }
   }
 }
@@ -983,13 +1031,32 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   }
 }
 
+/// Return the longest nop which can be efficiently decoded for the given
+/// target cpu.  15-bytes is the longest single NOP instruction, but some
+/// platforms can't decode the longest forms efficiently.
+static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
+  uint64_t MaxNopLength = 10;
+  if (STI.getFeatureBits()[X86::ProcIntelSLM])
+    MaxNopLength = 7;
+  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+    MaxNopLength = 15;
+  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+    MaxNopLength = 11;
+  return MaxNopLength;
+}
+
 /// Emit the largest nop instruction smaller than or equal to \p NumBytes
 /// bytes.  Return the size of nop emitted.
 static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
                         const MCSubtargetInfo &STI) {
-  // This works only for 64bit. For 32bit we have to do additional checking if
-  // the CPU supports multi-byte nops.
-  assert(Is64Bit && "EmitNops only supports X86-64");
+  if (!Is64Bit) {
+    // TODO Do additional checking if the CPU supports multi-byte nops.
+    OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI);
+    return 1;
+  }
+
+  // Cap a single nop emission at the profitable value for the target
+  NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
 
   unsigned NopSize;
   unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
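A note on the encoding detail the @@ -876 hunk exploits: the 2-byte VEX prefix (0xC5) carries only the R register-extension bit, so a source register in xmm8-xmm15/ymm8-ymm15 read through ModRM.rm forces the 3-byte 0xC4 prefix, while the VEX.vvvv field encodes all sixteen registers in either form. A standalone sketch of the swap condition, distilled from the hunk (the free-standing helper is illustrative, not part of the patch):

    // For an MRMSrcReg VEX_4V instruction, operand 1 is encoded in VEX.vvvv
    // (a full 4-bit field in both prefix forms); operand 2 lands in ModRM.rm,
    // where an extended register needs the VEX.B bit only the 3-byte prefix has.
    static bool shouldCommuteForTwoByteVex(bool Src1IsExtended,
                                           bool Src2IsExtended) {
      return !Src1IsExtended && Src2IsExtended;
    }

The swap is only performed when it is semantically safe: either the opcode is commutable (the default case), or, for VCMP, the predicate is symmetric (EQ, UNORD, NEQ, ORD).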
@@ -1094,10 +1161,35 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
   }
 }
 
+/// A RAII helper which defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+  MCStreamer &OS;
+  const bool OldAllowAutoPadding;
+  NoAutoPaddingScope(MCStreamer &OS)
+      : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
+    changeAndComment(false);
+  }
+  ~NoAutoPaddingScope() {
+    changeAndComment(OldAllowAutoPadding);
+  }
+  void changeAndComment(bool b) {
+    if (b == OS.getAllowAutoPadding())
+      return;
+    OS.setAllowAutoPadding(b);
+    if (b)
+      OS.emitRawComment("autopadding");
+    else
+      OS.emitRawComment("noautopadding");
+  }
+};
+
 void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
                                     X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   StatepointOpers SOpers(&MI);
   if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
     EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
@@ -1148,7 +1240,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
 
   // Record our statepoint node in the same section used by STACKMAP
   // and PATCHPOINT
-  SM.recordStatepoint(MI);
+  auto &Ctx = OutStreamer->getContext();
+  MCSymbol *MILabel = Ctx.createTempSymbol();
+  OutStreamer->EmitLabel(MILabel);
+  SM.recordStatepoint(*MILabel, MI);
 }
 
 void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
@@ -1156,6 +1251,8 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
   // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>,
   //                  <opcode>, <operands>
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   Register DefRegister = FaultingMI.getOperand(0).getReg();
   FaultMaps::FaultKind FK =
       static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
@@ -1163,8 +1260,12 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
   unsigned Opcode = FaultingMI.getOperand(3).getImm();
   unsigned OperandsBeginIdx = 4;
 
+  auto &Ctx = OutStreamer->getContext();
+  MCSymbol *FaultingLabel = Ctx.createTempSymbol();
+  OutStreamer->EmitLabel(FaultingLabel);
+
   assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
-  FM.recordFaultingOp(FK, HandlerLabel);
+  FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
 
   MCInst MI;
   MI.setOpcode(Opcode);
@@ -1199,6 +1300,8 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
                                       X86MCInstLower &MCIL) {
   // PATCHABLE_OP minsize, opcode, operands
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   unsigned MinSize = MI.getOperand(0).getImm();
   unsigned Opcode = MI.getOperand(1).getImm();
 
@@ -1236,7 +1339,12 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
 //                  <id>, <shadowBytes>, ...
 void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
   SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
-  SM.recordStackMap(MI);
+
+  auto &Ctx = OutStreamer->getContext();
+  MCSymbol *MILabel = Ctx.createTempSymbol();
+  OutStreamer->EmitLabel(MILabel);
+
+  SM.recordStackMap(*MILabel, MI);
   unsigned NumShadowBytes = MI.getOperand(1).getImm();
   SMShadowTracker.reset(NumShadowBytes);
 }
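The NoAutoPaddingScope struct introduced above is a conventional save-and-restore RAII guard: the constructor stashes the streamer's current auto-padding setting and turns padding off, and the destructor restores the saved value on every exit path, so the lowering routines need no explicit cleanup. A minimal usage sketch (the function name is a hypothetical stand-in for the call sites in this patch):

    void lowerPseudoWithFixedLayout(MCStreamer &OS) {
      NoAutoPaddingScope NoPadScope(OS); // padding disabled from here on
      // ... emit a label plus the instructions it must stay adjacent to ...
    } // previous setting restored here, even on early return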
@@ -1249,7 +1357,12 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
 
   SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
 
-  SM.recordPatchPoint(MI);
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+  auto &Ctx = OutStreamer->getContext();
+  MCSymbol *MILabel = Ctx.createTempSymbol();
+  OutStreamer->EmitLabel(MILabel);
+  SM.recordPatchPoint(*MILabel, MI);
 
   PatchPointOpers opers(&MI);
   unsigned ScratchIdx = opers.getNextScratchIdx();
@@ -1305,6 +1418,8 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
                                               X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // We want to emit the following pattern, which follows the x86 calling
   // convention to prepare for the trampoline call to be patched in.
   //
@@ -1337,10 +1452,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
 
   // The default C calling convention will place two arguments into %rcx and
   // %rdx -- so we only work with those.
-  unsigned DestRegs[] = {X86::RDI, X86::RSI};
+  const Register DestRegs[] = {X86::RDI, X86::RSI};
   bool UsedMask[] = {false, false};
   // Filled out in loop.
-  unsigned SrcRegs[] = {0, 0};
+  Register SrcRegs[] = {0, 0};
 
   // Then we put the operands in the %rdi and %rsi registers. We spill the
   // values in the register before we clobber them, and mark them as used in
@@ -1350,7 +1465,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
   for (unsigned I = 0; I < MI.getNumOperands(); ++I)
     if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
       assert(Op->isReg() && "Only support arguments in registers");
-      SrcRegs[I] = Op->getReg();
+      SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
       if (SrcRegs[I] != DestRegs[I]) {
         UsedMask[I] = true;
         EmitAndCountInstruction(
@@ -1361,6 +1476,9 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
     }
 
   // Now that the register values are stashed, mov arguments into place.
+  // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten over the register before
+  // we can copy from it.
   for (unsigned I = 0; I < MI.getNumOperands(); ++I)
     if (SrcRegs[I] != DestRegs[I])
       EmitAndCountInstruction(
@@ -1396,6 +1514,8 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
                                                     X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // We want to emit the following pattern, which follows the x86 calling
   // convention to prepare for the trampoline call to be patched in.
   //
@@ -1429,11 +1549,11 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
   // An x86-64 convention may place three arguments into %rcx, %rdx, and R8,
   // so we'll work with those. Or we may be called via SystemV, in which case
   // we don't have to do any translation.
-  unsigned DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
+  const Register DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
   bool UsedMask[] = {false, false, false};
   // Will fill out src regs in the loop.
-  unsigned SrcRegs[] = {0, 0, 0};
+  Register SrcRegs[] = {0, 0, 0};
 
   // Then we put the operands in the SystemV registers. We spill the values in
   // the registers before we clobber them, and mark them as used in UsedMask.
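Each record call site in this patch now follows the same four-step shape: fetch the MCContext, create a temporary symbol, bind it to the current output position, and hand it to the stackmap machinery, instead of leaving label creation to the StackMaps/FaultMaps code itself. Condensed from the hunks above:

    auto &Ctx = OutStreamer->getContext();      // MCContext owns all symbols
    MCSymbol *MILabel = Ctx.createTempSymbol(); // assembler-local .Ltmp label
    OutStreamer->EmitLabel(MILabel);            // bind it to the current offset
    SM.recordStackMap(*MILabel, MI);            // record against that label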
@@ -1443,7 +1563,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
     if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
       // TODO: Is register only support adequate?
       assert(Op->isReg() && "Only supports arguments in registers");
-      SrcRegs[I] = Op->getReg();
+      SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
       if (SrcRegs[I] != DestRegs[I]) {
         UsedMask[I] = true;
         EmitAndCountInstruction(
@@ -1459,6 +1579,9 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
   // is clobbers. We've already added nops to account for the size of mov and
   // push if the register is in the right place, so we only have to worry about
   // emitting movs.
+  // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten over the register before
+  // we can copy from it.
   for (unsigned I = 0; I < MI.getNumOperands(); ++I)
     if (UsedMask[I])
       EmitAndCountInstruction(
@@ -1490,6 +1613,19 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
 
 void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
                                                   X86MCInstLower &MCIL) {
+
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+  const Function &F = MF->getFunction();
+  if (F.hasFnAttribute("patchable-function-entry")) {
+    unsigned Num;
+    if (F.getFnAttribute("patchable-function-entry")
+            .getValueAsString()
+            .getAsInteger(10, Num))
+      return;
+    EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
+    return;
+  }
   // We want to emit the following pattern:
   //
   //   .p2align 1, ...
@@ -1517,6 +1653,8 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
 
 void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
                                        X86MCInstLower &MCIL) {
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // Since PATCHABLE_RET takes the opcode of the return statement as an
   // argument, we use that to emit the correct form of the RET that we want.
   // i.e. when we see this:
@@ -1547,6 +1685,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
 
 void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
                                              X86MCInstLower &MCIL) {
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // Like PATCHABLE_RET, we have the actual instruction in the operands to this
   // instruction so we lower that particular instruction and its operands.
   // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
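One subtlety in the new patchable-function-entry handling above: StringRef::getAsInteger returns true on *failure*, so a malformed attribute value makes the lowering bail out silently rather than emit a bogus nop count. Sketched in isolation (assuming the surrounding X86AsmPrinter context from the hunk):

    const Function &F = MF->getFunction();
    if (F.hasFnAttribute("patchable-function-entry")) {
      unsigned Num;
      StringRef Val =
          F.getFnAttribute("patchable-function-entry").getValueAsString();
      if (Val.getAsInteger(10, Num)) // base 10; true means a parse error
        return;                      // ignore the attribute rather than guess
      EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
      return;
    }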