Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp |  10
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp                 |  11
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp         |  12
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp               | 217
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h               |   5
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp            |  68
6 files changed, 142 insertions(+), 181 deletions(-)
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a7059c6914df..4ddc1f0ba429 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
- // means that we need to bias the immediate field of the instruction with
- // the size of the immediate field. If we have this case, add it into the
+ // means that we need to bias the displacement field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
// expression to emit.
- int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+ // Note: rip-relative addressing using immediate displacement values should
+ // not be adjusted, assuming it was the user's intent.
+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+ ? X86II::getSizeOfImm(TSFlags)
+ : 0;
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
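
A minimal C++ sketch of the guarded bias above (the helper name and parameters are illustrative, not LLVM API): only relocatable displacements are biased by the size of a trailing immediate, because their fixups resolve relative to the end of the instruction, while a literal numeric displacement is presumed to be exactly what the user wrote.

    // Illustrative sketch of the condition added above, not LLVM code.
    int computeDispBias(bool DispIsImm, bool InstHasImm, int SizeOfImm) {
      // A literal displacement such as "movl $7, 16(%rip)" is encoded as
      // written. Only symbolic displacements, whose fixups resolve relative
      // to the *next* instruction, are biased by the size of any immediate
      // that follows the displacement in the encoding.
      return (!DispIsImm && InstHasImm) ? SizeOfImm : 0;
    }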
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 71526dd77f11..2a501efbc1bf 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
+ bool EmitPercent = true;
+
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, 32);
break;
+ case 'V':
+ EmitPercent = false;
+ LLVM_FALLTHROUGH;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
break;
}
- O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
return false;
}
@@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'w': // Print HImode register
case 'k': // Print SImode register
case 'q': // Print DImode register
+ case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
printOperand(*this, MI, OpNo, O);
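
A hedged usage sketch of the new 'V' modifier (GCC-style inline asm; the function, the register pinning, and the thunk symbol are illustrative assumptions): 'V' prints the operand's register name without the '%' prefix, so the name can be spliced into a symbol, for example to call a per-register external thunk.

    // Illustrative only, assuming x86-64: %V0 expands to "r11", not "%r11".
    void call_through_thunk(void (*fn)(void)) {
      register void (*target)(void) asm("r11") = fn;  // pin to r11
      asm volatile("call __x86_indirect_thunk_%V0" : : "r"(target));
    }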
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index ba7280c29cc9..bc0f55f581ff 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR32rr, X86::KXORDrr);
createReplacer(X86::XOR64rr, X86::KXORQrr);
- createReplacer(X86::TEST32rr, X86::KTESTDrr);
- createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
- createReplacer(X86::TEST8rr, X86::KTESTBrr);
- createReplacer(X86::TEST16rr, X86::KTESTWrr);
+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+ // to prove only Z flag is used.
+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
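
The flag mismatch behind the disabled replacements, modeled in C++ (flag behavior paraphrased from the Intel SDM; the structs are illustrative): TEST sets SF/ZF/PF from the AND of its operands and clears CF/OF, while KTEST sets ZF from the AND, sets CF from the AND-NOT, and zeroes SF/PF. Only a consumer of ZF sees identical behavior, which is what the TODO asks to prove.

    #include <cstdint>
    // Illustrative model, not LLVM code.
    struct Flags { bool ZF, CF; };
    Flags testFlags(uint64_t A, uint64_t B) {
      return { (A & B) == 0, /*CF=*/false };    // TEST always clears CF
    }
    Flags ktestFlags(uint64_t A, uint64_t B) {
      return { (A & B) == 0, (A & ~B) == 0 };   // CF can differ from TEST
    }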
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38885c42b529..9237833a2cd0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
-// Emit KTEST instruction for bit vectors on AVX-512
-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (Op.getOpcode() == ISD::BITCAST) {
- auto hasKTEST = [&](MVT VT) {
- unsigned SizeInBits = VT.getSizeInBits();
- return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
- (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
- };
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType().getSimpleVT();
- if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
- hasKTEST(Op0VT))
- return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
- }
- return SDValue();
-}
-
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
if (Opcode == 0) {
- // Emit KTEST for bit vectors
- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
- return Node;
-
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
+// Try to select this as a KTEST+SETCC if possible.
+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Only support equality comparisons.
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return SDValue();
+
+ // Must be a bitcast from vXi1.
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ MVT VT = Op0.getSimpleValueType();
+ if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
+ !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+ return SDValue();
+
+ X86::CondCode X86CC;
+ if (isNullConstant(Op1)) {
+ X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ } else
+ return SDValue();
+
+ SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
+ return getSETCC(X86CC, KTEST, dl, DAG);
+}
+
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
@@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
+ // Try to lower using KTEST.
+ if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
+ return NewSetCC;
+
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
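
Restating what the new EmitKTEST accepts as a compact predicate (an illustrative helper, not LLVM API): an EQ/NE compare of a scalar that is a bitcast of a vXi1 mask, against zero, at widths where a KTEST instruction exists (8/16 bits with DQI, 32/64 bits with BWI).

    // Illustrative only.
    bool wouldEmitKTEST(bool HasDQI, bool HasBWI, unsigned MaskBits,
                        bool IsEqOrNe, bool RhsIsZero) {
      bool LegalWidth = (HasDQI && (MaskBits == 8 || MaskBits == 16)) ||
                        (HasBWI && (MaskBits == 32 || MaskBits == 64));
      return IsEqOrNe && RhsIsZero && LegalWidth;
    }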
@@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
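
Why the KUNPCK operands are swapped, as a worked example (KUNPCKBW semantics per the Intel SDM; the helper is illustrative): the instruction takes its result's low half from the second source, while CONCAT_VECTORS places its first operand in the low elements, so Src2 must be passed first.

    #include <cstdint>
    // Illustrative model of kunpckbw on raw mask bits.
    uint16_t kunpckbw(uint8_t Src1, uint8_t Src2) {
      return (uint16_t(Src1) << 8) | Src2;  // low byte = Src2, high = Src1
    }
    // kunpckbw(0xAB, 0xCD) == 0xABCD, i.e. CONCAT_VECTORS(Src2, Src1) read
    // back as a 16-bit mask.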
@@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
+ if (Subtarget.useRetpolineExternalThunk()) {
+ // When using an external thunk for retpolines, we pick names that match the
+ // names GCC happens to use as well. This helps simplify the implementation
+ // of the thunks for kernels where they have no easy ability to create
+ // aliases and are doing non-trivial configuration of the thunk's body. For
+ // example, the Linux kernel will do boot-time hot patching of the thunk
+ // bodies and cannot easily export aliases of these to loaded modules.
+ //
+ // Note that at any point in the future, we may need to change the semantics
+ // of how we implement retpolines and at that time will likely change the
+ // name of the called thunk. Essentially, there is no hard guarantee that
+ // LLVM will generate calls to specific thunks, we merely make a best-effort
+ // attempt to help out kernels and other systems where duplicating the
+ // thunks is costly.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__x86_indirect_thunk_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__x86_indirect_thunk_r11";
+ }
+ llvm_unreachable("unexpected reg for retpoline");
+ }
+
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
- case 0:
- assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_push"
- : "__llvm_retpoline_push";
case X86::EAX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_eax"
- : "__llvm_retpoline_eax";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
case X86::ECX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_ecx"
- : "__llvm_retpoline_ecx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
case X86::EDX:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_edx"
- : "__llvm_retpoline_edx";
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
case X86::R11:
- return Subtarget.useRetpolineExternalThunk()
- ? "__llvm_external_retpoline_r11"
- : "__llvm_retpoline_r11";
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
@@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
- // are available, push the callee instead. This is less efficient, but is
- // necessary for functions using 3 regparms. Such function calls are
- // (currently) not eligible for tail call optimization, because there is no
- // scratch register available to hold the address of the callee.
+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+ // register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
- AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
@@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
break;
}
}
+ if (!AvailableReg)
+ report_fatal_error("calling convention incompatible with retpoline, no "
+ "available registers");
const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
- if (AvailableReg == 0) {
- // No register available. Use PUSH. This must not be a tailcall, and this
- // must not be x64.
- if (Subtarget.is64Bit())
- report_fatal_error(
- "Cannot make an indirect call on x86-64 using both retpoline and a "
- "calling convention that preservers r11");
- if (Opc != X86::CALLpcrel32)
- report_fatal_error("Cannot make an indirect tail call on x86 using "
- "retpoline without a preserved register");
- BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- } else {
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
- .addReg(CalleeVReg);
- MI.getOperand(0).ChangeToES(Symbol);
- MI.setDesc(TII->get(Opc));
- MachineInstrBuilder(*BB->getParent(), &MI)
- .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
- }
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+ .addReg(CalleeVReg);
+ MI.getOperand(0).ChangeToES(Symbol);
+ MI.setDesc(TII->get(Opc));
+ MachineInstrBuilder(*BB->getParent(), &MI)
+ .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
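
The register-selection policy above, restated as a sketch (strings stand in for register units; not the real API): on 64-bit, R11 is always usable for this purpose, and on 32-bit the first of EAX/ECX/EDX/EDI not already consumed by the call is taken, with a hard error if all four are in use.

    #include <set>
    #include <string>
    // Illustrative only.
    const char *pickRetpolineScratch(bool Is64Bit,
                                     const std::set<std::string> &Used) {
      if (Is64Bit) return "r11";
      for (const char *R : {"eax", "ecx", "edx", "edi"})
        if (!Used.count(R)) return R;
      return nullptr;  // caller issues report_fatal_error, as above
    }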
@@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
- N0->getOpcode() == ISD::OR) {
- SDValue Op0 = N0->getOperand(0);
- SDValue Op1 = N0->getOperand(1);
- MVT TrunckVT;
- MVT BitcastVT;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v16i1:
- TrunckVT = MVT::i8;
- BitcastVT = MVT::v8i1;
- break;
- case MVT::v32i1:
- TrunckVT = MVT::i16;
- BitcastVT = MVT::v16i1;
- break;
- case MVT::v64i1:
- TrunckVT = MVT::i32;
- BitcastVT = MVT::v32i1;
- break;
- }
- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
- bool isArg0UndefLeft =
- Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
- bool isArg1UndefLeft =
- Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
- SDValue OpLeft;
- SDValue OpRight;
- if (isArg0UndefRight && isArg1UndefLeft) {
- OpLeft = Op0;
- OpRight = Op1;
- } else if (isArg1UndefRight && isArg0UndefLeft) {
- OpLeft = Op1;
- OpRight = Op0;
- } else
- return SDValue();
- SDLoc DL(BitCast);
- SDValue Shr = OpLeft->getOperand(0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
- }
-
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0782d5598746..fae0889950b2 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
ROUNDP, ROUNDS
};
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
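
These entries route the AVX-512 mask-unpack intrinsics through the KUNPCK lowering added above; a brief usage sketch, under the assumption that the <immintrin.h> wrapper _mm512_kunpackb lowers to llvm.x86.avx512.kunpck.bw:

    #include <immintrin.h>
    // The result's low byte comes from the second argument.
    __mmask16 demo() {
      return _mm512_kunpackb(0xAB, 0xCD);  // 0xABCD
    }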
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 223fa5771498..d03826bbe992 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
static const char EAXThunkName[] = "__llvm_retpoline_eax";
static const char ECXThunkName[] = "__llvm_retpoline_ecx";
static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char PushThunkName[] = "__llvm_retpoline_push";
+static const char EDIThunkName[] = "__llvm_retpoline_edi";
namespace {
class X86RetpolineThunks : public MachineFunctionPass {
@@ -74,7 +74,6 @@ private:
void createThunkFunction(Module &M, StringRef Name);
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
- void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
};
@@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
createThunkFunction(M, R11ThunkName);
else
for (StringRef Name :
- {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
+ {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
createThunkFunction(M, Name);
InsertedThunks = true;
return true;
@@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
populateThunk(MF, X86::R11);
} else {
// For 32-bit targets we need to emit a collection of thunks for various
- // possible scratch registers as well as a fallback that is used when
- // there are no scratch registers and assumes the retpoline target has
- // been pushed.
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
// __llvm_retpoline_eax:
// calll .Leax_call_target
// .Leax_capture_spec:
@@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
// movl %edx, (%esp)
// retl
//
- // This last one is a bit more special and so needs a little extra
- // handling.
- // __llvm_retpoline_push:
- // calll .Lpush_call_target
- // .Lpush_capture_spec:
- // pause
- // lfence
- // jmp .Lpush_capture_spec
- // .align 16
- // .Lpush_call_target:
- // # Clear pause_loop return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
if (MF.getName() == EAXThunkName)
populateThunk(MF, X86::EAX);
else if (MF.getName() == ECXThunkName)
populateThunk(MF, X86::ECX);
else if (MF.getName() == EDXThunkName)
populateThunk(MF, X86::EDX);
- else if (MF.getName() == PushThunkName)
- populateThunk(MF);
+ else if (MF.getName() == EDIThunkName)
+ populateThunk(MF, X86::EDI);
else
llvm_unreachable("Invalid thunk name on x86-32!");
}
@@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
.addReg(Reg);
}
-void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
- MachineBasicBlock &MBB) {
- // The instruction sequence we use to replace the return address without
- // a scratch register is somewhat complicated:
- // # Clear capture_spec from return address.
- // addl $4, %esp
- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
- // pushl 4(%esp) # Push callee
- // pushl 4(%esp) # Push RA
- // popl 8(%esp) # Pop RA to final RA
- // popl (%esp) # Pop callee to next top of stack
- // retl # Ret to callee
- BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
- .addReg(X86::ESP)
- .addImm(4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
- false, 4);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 8);
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
- false, 0);
-}
-
void X86RetpolineThunks::populateThunk(MachineFunction &MF,
Optional<unsigned> Reg) {
// Set MF properties. We never use vregs...
@@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CaptureSpec->addSuccessor(CaptureSpec);
CallTarget->setAlignment(4);
- if (Reg) {
- insertRegReturnAddrClobber(*CallTarget, *Reg);
- } else {
- assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
- insert32BitPushReturnAddrClobber(*CallTarget);
- }
+ insertRegReturnAddrClobber(*CallTarget, *Reg);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}