Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp |  10
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp                 |  11
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp         |  12
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp               | 217
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h               |   5
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp            |  68
6 files changed, 142 insertions(+), 181 deletions(-)
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a7059c6914df..4ddc1f0ba429 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
- // means that we need to bias the immediate field of the instruction with
- // the size of the immediate field. If we have this case, add it into the
+ // means that we need to bias the displacement field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
// expression to emit.
- int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+ // Note: rip-relative addressing using immediate displacement values should
+ // not be adjusted, assuming it was the user's intent.
+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+ ? X86II::getSizeOfImm(TSFlags)
+ : 0;
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
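
A minimal C++ sketch of the guarded bias above (the helper name and parameters are illustrative, not LLVM API): only relocatable displacements are biased by the size of a trailing immediate, because their fixups resolve relative to the end of the instruction, while a literal numeric displacement is presumed to be exactly what the user wrote.

    // Illustrative sketch of the condition added above, not LLVM code.
    int computeDispBias(bool DispIsImm, bool InstHasImm, int SizeOfImm) {
      // A literal displacement such as "movl $7, 16(%rip)" is encoded as
      // written. Only symbolic displacements, whose fixups resolve relative
      // to the *next* instruction, are biased by the size of any immediate
      // that follows the displacement in the encoding.
      return (!DispIsImm && InstHasImm) ? SizeOfImm : 0;
    }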
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 71526dd77f11..2a501efbc1bf 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
+ bool EmitPercent = true;
+
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, 32);
break;
+ case 'V':
+ EmitPercent = false;
+ LLVM_FALLTHROUGH;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
break;
}
- O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
return false;
}
@@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'w': // Print HImode register
case 'k': // Print SImode register
case 'q': // Print DImode register
+ case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
printOperand(*this, MI, OpNo, O);
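
A hedged usage sketch of the new 'V' modifier (GCC-style inline asm; the function, the register pinning, and the thunk symbol are illustrative assumptions): 'V' prints the operand's register name without the '%' prefix, so the name can be spliced into a symbol, for example to call a per-register external thunk.

    // Illustrative only, assuming x86-64: %V0 expands to "r11", not "%r11".
    void call_through_thunk(void (*fn)(void)) {
      register void (*target)(void) asm("r11") = fn;  // pin to r11
      asm volatile("call __x86_indirect_thunk_%V0" : : "r"(target));
    }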
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index ba7280c29cc9..bc0f55f581ff 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR32rr, X86::KXORDrr);
createReplacer(X86::XOR64rr, X86::KXORQrr);
- createReplacer(X86::TEST32rr, X86::KTESTDrr);
- createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
- createReplacer(X86::TEST8rr, X86::KTESTBrr);
- createReplacer(X86::TEST16rr, X86::KTESTWrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
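
The flag mismatch behind the disabled replacements, modeled in C++ (flag behavior paraphrased from the Intel SDM; the structs are illustrative): TEST sets SF/ZF/PF from the AND of its operands and clears CF/OF, while KTEST sets ZF from the AND, sets CF from the AND-NOT, and zeroes SF/PF. Only a consumer of ZF sees identical behavior, which is what the TODO asks to prove.

    #include <cstdint>
    // Illustrative model, not LLVM code.
    struct Flags { bool ZF, CF; };
    Flags testFlags(uint64_t A, uint64_t B) {
      return { (A & B) == 0, /*CF=*/false };    // TEST always clears CF
    }
    Flags ktestFlags(uint64_t A, uint64_t B) {
      return { (A & B) == 0, (A & ~B) == 0 };   // CF can differ from TEST
    }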
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38885c42b529..9237833a2cd0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
-// Emit KTEST instruction for bit vectors on AVX-512
-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (Op.getOpcode() == ISD::BITCAST) {
- auto hasKTEST = [&](MVT VT) {
- unsigned SizeInBits = VT.getSizeInBits();
- return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
- (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
- };
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType().getSimpleVT();
- if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
- hasKTEST(Op0VT))
- return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
- }
- return SDValue();
-}
-
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
if (Opcode == 0) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
-
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
+// Try to select this as a KTEST+SETCC if possible.
+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Only support equality comparisons.
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return SDValue();
+
+ // Must be a bitcast from vXi1.
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ MVT VT = Op0.getSimpleValueType();
+ if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
+ !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+ return SDValue();
+
+ X86::CondCode X86CC;
+ if (isNullConstant(Op1)) {
+ X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else
+ return SDValue();
+
+ SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
+ return getSETCC(X86CC, KTEST, dl, DAG);
+}
+
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
@@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
+ // Try to lower using KTEST.
+ if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
+ return NewSetCC;
+
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
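
Restating what the new EmitKTEST accepts as a compact predicate (an illustrative helper, not LLVM API): an EQ/NE compare of a scalar that is a bitcast of a vXi1 mask, against zero, at widths where a KTEST instruction exists (8/16 bits with DQI, 32/64 bits with BWI).

    // Illustrative only.
    bool wouldEmitKTEST(bool HasDQI, bool HasBWI, unsigned MaskBits,
                        bool IsEqOrNe, bool RhsIsZero) {
      bool LegalWidth = (HasDQI && (MaskBits == 8 || MaskBits == 16)) ||
                        (HasBWI && (MaskBits == 32 || MaskBits == 64));
      return IsEqOrNe && RhsIsZero && LegalWidth;
    }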
@@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
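
Why the KUNPCK operands are swapped, as a worked example (KUNPCKBW semantics per the Intel SDM; the helper is illustrative): the instruction takes its result's low half from the second source, while CONCAT_VECTORS places its first operand in the low elements, so Src2 must be passed first.

    #include <cstdint>
    // Illustrative model of kunpckbw on raw mask bits.
    uint16_t kunpckbw(uint8_t Src1, uint8_t Src2) {
      return (uint16_t(Src1) << 8) | Src2;  // low byte = Src2, high = Src1
    }
    // kunpckbw(0xAB, 0xCD) == 0xABCD, i.e. CONCAT_VECTORS(Src2, Src1) read
    // back as a 16-bit mask.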
@@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
+ if (Subtarget.useRetpolineExternalThunk()) {
+ // When using an external thunk for retpolines, we pick names that match the
+ // names GCC happens to use as well. This helps simplify the implementation
+ // of the thunks for kernels where they have no easy ability to create
+ // aliases and are doing non-trivial configuration of the thunk's body. For
+ // example, the Linux kernel will do boot-time hot patching of the thunk
+ // bodies and cannot easily export aliases of these to loaded modules.
+ //
+ // Note that at any point in the future, we may need to change the semantics
+ // of how we implement retpolines and at that time will likely change the
+ // name of the called thunk. Essentially, there is no hard guarantee that
+ // LLVM will generate calls to specific thunks, we merely make a best-effort
+ // attempt to help out kernels and other systems where duplicating the
+ // thunks is costly.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__x86_indirect_thunk_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+ }
+
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
- case 0:
- assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_push"
- : "__llvm_retpoline_push";
case X86::EAX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_eax"
- : "__llvm_retpoline_eax";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
case X86::ECX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_ecx"
- : "__llvm_retpoline_ecx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
case X86::EDX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_edx"
- : "__llvm_retpoline_edx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
case X86::R11:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_r11"
- : "__llvm_retpoline_r11";
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
@@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
- // are available, push the callee instead. This is less efficient, but is
- // necessary for functions using 3 regparms. Such function calls are
- // (currently) not eligible for tail call optimization, because there is no
- // scratch register available to hold the address of the callee.
+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+ // register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
- AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
@@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
break;
}
}
+ if (!AvailableReg)
+ report_fatal_error("calling convention incompatible with retpoline, no "
+ "available registers");
const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
- if (AvailableReg == 0) {
- // No register available. Use PUSH. This must not be a tailcall, and this
- // must not be x64.
- if (Subtarget.is64Bit())
- report_fatal_error(
- "Cannot make an indirect call on x86-64 using both retpoline and a "
- "calling convention that preservers r11");
- if (Opc != X86::CALLpcrel32)
- report_fatal_error("Cannot make an indirect tail call on x86 using "
- "retpoline without a preserved register");
- BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- } else {
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
- .addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- MachineInstrBuilder(*BB->getParent(), &MI)
- .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
- }
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+ .addReg(CalleeVReg);
+ MI.getOperand(0).ChangeToES(Symbol);
+ MI.setDesc(TII->get(Opc));
+ MachineInstrBuilder(*BB->getParent(), &MI)
+ .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
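
The register-selection policy above, restated as a sketch (strings stand in for register units; not the real API): on 64-bit, R11 is always usable for this purpose, and on 32-bit the first of EAX/ECX/EDX/EDI not already consumed by the call is taken, with a hard error if all four are in use.

    #include <set>
    #include <string>
    // Illustrative only.
    const char *pickRetpolineScratch(bool Is64Bit,
                                     const std::set<std::string> &Used) {
      if (Is64Bit) return "r11";
      for (const char *R : {"eax", "ecx", "edx", "edi"})
        if (!Used.count(R)) return R;
      return nullptr;  // caller issues report_fatal_error, as above
    }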
@@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
- N0->getOpcode() == ISD::OR) {
- SDValue Op0 = N0->getOperand(0);
- SDValue Op1 = N0->getOperand(1);
- MVT TrunckVT;
- MVT BitcastVT;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v16i1:
- TrunckVT = MVT::i8;
- BitcastVT = MVT::v8i1;
- break;
- case MVT::v32i1:
- TrunckVT = MVT::i16;
- BitcastVT = MVT::v16i1;
- break;
- case MVT::v64i1:
- TrunckVT = MVT::i32;
- BitcastVT = MVT::v32i1;
- break;
- }
- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
- bool isArg0UndefLeft =
- Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
- bool isArg1UndefLeft =
- Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
- SDValue OpLeft;
- SDValue OpRight;
- if (isArg0UndefRight && isArg1UndefLeft) {
- OpLeft = Op0;
- OpRight = Op1;
- } else if (isArg1UndefRight && isArg0UndefLeft) {
- OpLeft = Op1;
- OpRight = Op0;
- } else
- return SDValue();
- SDLoc DL(BitCast);
- SDValue Shr = OpLeft->getOperand(0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
- }
-
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0782d5598746..fae0889950b2 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
ROUNDP, ROUNDS
};
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
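
These entries route the AVX-512 mask-unpack intrinsics through the KUNPCK lowering added above; a brief usage sketch, under the assumption that the <immintrin.h> wrapper _mm512_kunpackb lowers to llvm.x86.avx512.kunpck.bw:

    #include <immintrin.h>
    // The result's low byte comes from the second argument.
    __mmask16 demo() {
      return _mm512_kunpackb(0xAB, 0xCD);  // 0xABCD
    }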
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 223fa5771498..d03826bbe992 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
static const char EAXThunkName[] = "__llvm_retpoline_eax";
static const char ECXThunkName[] = "__llvm_retpoline_ecx";
static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char PushThunkName[] = "__llvm_retpoline_push";
+static const char EDIThunkName[] = "__llvm_retpoline_edi";
namespace {
class X86RetpolineThunks : public MachineFunctionPass {
@@ -74,7 +74,6 @@ private:
void createThunkFunction(Module &M, StringRef Name);
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
- void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
};
@@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
createThunkFunction(M, R11ThunkName);
else
for (StringRef Name :
- {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
+ {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
createThunkFunction(M, Name);
InsertedThunks = true;
return true;
@@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
populateThunk(MF, X86::R11);
} else {
// For 32-bit targets we need to emit a collection of thunks for various
- // possible scratch registers as well as a fallback that is used when
- // there are no scratch registers and assumes the retpoline target has
- // been pushed.
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
// __llvm_retpoline_eax:
// calll .Leax_call_target
// .Leax_capture_spec:
@@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
// movl %edx, (%esp)
// retl
//
- // This last one is a bit more special and so needs a little extra
- // handling.
- // __llvm_retpoline_push:
- // calll .Lpush_call_target
- // .Lpush_capture_spec:
- // pause
- // lfence
- // jmp .Lpush_capture_spec
- // .align 16
- // .Lpush_call_target:
- // # Clear pause_loop return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
if (MF.getName() == EAXThunkName)
populateThunk(MF, X86::EAX);
else if (MF.getName() == ECXThunkName)
populateThunk(MF, X86::ECX);
else if (MF.getName() == EDXThunkName)
populateThunk(MF, X86::EDX);
- else if (MF.getName() == PushThunkName)
- populateThunk(MF);
+ else if (MF.getName() == EDIThunkName)
+ populateThunk(MF, X86::EDI);
else
llvm_unreachable("Invalid thunk name on x86-32!");
}
@@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
.addReg(Reg);
}
-void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
- MachineBasicBlock &MBB) {
- // The instruction sequence we use to replace the return address without
- // a scratch register is somewhat complicated:
- // # Clear capture_spec from return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
- BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
- .addReg(X86::ESP)
- .addImm(4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 8);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 0);
-}
-
void X86RetpolineThunks::populateThunk(MachineFunction &MF,
Optional<unsigned> Reg) {
// Set MF properties. We never use vregs...
@@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CaptureSpec->addSuccessor(CaptureSpec);
CallTarget->setAlignment(4);
- if (Reg) {
- insertRegReturnAddrClobber(*CallTarget, *Reg);
- } else {
- assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
- insert32BitPushReturnAddrClobber(*CallTarget);
- }
+ insertRegReturnAddrClobber(*CallTarget, *Reg);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}