Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp |  10
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp                 |  11
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp         |  12
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp               | 217
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h               |   5
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp            |  68
6 files changed, 142 insertions(+), 181 deletions(-)
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a7059c6914df..4ddc1f0ba429 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
     // rip-relative addressing is actually relative to the *next* instruction.
     // Since an immediate can follow the mod/rm byte for an instruction, this
-    // means that we need to bias the immediate field of the instruction with
-    // the size of the immediate field.  If we have this case, add it into the
+    // means that we need to bias the displacement field of the instruction with
+    // the size of the immediate field. If we have this case, add it into the
     // expression to emit.
-    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+    // Note: rip-relative addressing using immediate displacement values should
+    // not be adjusted, assuming it was the user's intent.
+    int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
+                      ? X86II::getSizeOfImm(TSFlags)
+                      : 0;
 
     EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
                   Fixups, -ImmSize);
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 71526dd77f11..2a501efbc1bf 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
 static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
                               char Mode, raw_ostream &O) {
   unsigned Reg = MO.getReg();
+  bool EmitPercent = true;
+
   switch (Mode) {
   default: return true;  // Unknown mode.
   case 'b': // Print QImode register
@@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
   case 'k': // Print SImode register
     Reg = getX86SubSuperRegister(Reg, 32);
     break;
+  case 'V':
+    EmitPercent = false;
+    LLVM_FALLTHROUGH;
   case 'q':
     // Print 64-bit register names if 64-bit integer registers are available.
     // Otherwise, print 32-bit register names.
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
     break;
   }
 
-  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  if (EmitPercent)
+    O << '%';
+
+  O << X86ATTInstPrinter::getRegisterName(Reg);
   return false;
 }
 
@@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
       case 'w': // Print HImode register
       case 'k': // Print SImode register
       case 'q': // Print DImode register
+      case 'V': // Print native register without '%'
         if (MO.isReg())
           return printAsmMRegister(*this, MO, ExtraCode[0], O);
         printOperand(*this, MI, OpNo, O);
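The 'V' modifier added above mirrors GCC's modifier of the same name. A minimal sketch of the intended use from GNU-style inline asm (not part of this change; the function here is illustrative): "%V0" prints the bare register name ("eax") where "%0" would print the AT&T form ("%eax"), which lets the asm text splice a register into a thunk symbol name, as Linux kernel code does:

    // Sketch: the "r" constraint picks some GPR for "target"; %V0 prints
    // that register's bare name, forming e.g. __x86_indirect_thunk_rax.
    void indirect_call(void (*target)(void)) {
      asm volatile("call __x86_indirect_thunk_%V0" : : "r"(target));
    }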
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index ba7280c29cc9..bc0f55f581ff 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
     createReplacer(X86::XOR32rr, X86::KXORDrr);
     createReplacer(X86::XOR64rr, X86::KXORQrr);
 
-    createReplacer(X86::TEST32rr, X86::KTESTDrr);
-    createReplacer(X86::TEST64rr, X86::KTESTQrr);
+    // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+    // to prove only Z flag is used.
+    //createReplacer(X86::TEST32rr, X86::KTESTDrr);
+    //createReplacer(X86::TEST64rr, X86::KTESTQrr);
   }
 
   if (STI->hasDQI()) {
@@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
     createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
     createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
 
-    createReplacer(X86::TEST8rr, X86::KTESTBrr);
-    createReplacer(X86::TEST16rr, X86::KTESTWrr);
+    // TODO: KTEST is not a replacement for TEST due to flag differences. Need
+    // to prove only Z flag is used.
+    //createReplacer(X86::TEST8rr, X86::KTESTBrr);
+    //createReplacer(X86::TEST16rr, X86::KTESTWrr);
 
     createReplacer(X86::XOR8rr, X86::KXORBrr);
   }
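Context for the disabled TEST-to-KTEST replacements (a summary, not from the patch): TEST sets SF and PF from its result and clears OF/CF, while KTEST defines only ZF and CF and zeroes the remaining status flags, so the rewrite is safe only when nothing but ZF is consumed. A sign check is the classic consumer that breaks:

    // Hypothetical pattern that the old replacement could miscompile:
    // this typically selects "test %edi, %edi; sets %al", which reads SF --
    // a flag KTEST always clears rather than computes.
    bool sign_bit_set(int mask_bits) { return mask_bits < 0; }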
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38885c42b529..9237833a2cd0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
   return false;
 }
 
-// Emit KTEST instruction for bit vectors on AVX-512
-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
-                         const X86Subtarget &Subtarget) {
-  if (Op.getOpcode() == ISD::BITCAST) {
-    auto hasKTEST = [&](MVT VT) {
-      unsigned SizeInBits = VT.getSizeInBits();
-      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
-             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
-    };
-    SDValue Op0 = Op.getOperand(0);
-    MVT Op0VT = Op0.getValueType().getSimpleVT();
-    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
-        hasKTEST(Op0VT))
-      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
-  }
-  return SDValue();
-}
-
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
-    // Emit KTEST for bit vectors
-    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
-      return Node;
     // Emit a CMP with 0, which is the TEST pattern.
     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                        DAG.getConstant(0, dl, Op.getValueType()));
@@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   }
 
   if (Opcode == 0) {
-    // Emit KTEST for bit vectors
-    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
-      return Node;
-
     // Emit a CMP with 0, which is the TEST pattern.
     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                        DAG.getConstant(0, dl, Op.getValueType()));
@@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   return Result;
 }
 
+// Try to select this as a KTEST+SETCC if possible.
+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         const X86Subtarget &Subtarget) {
+  // Only support equality comparisons.
+  if (CC != ISD::SETEQ && CC != ISD::SETNE)
+    return SDValue();
+
+  // Must be a bitcast from vXi1.
+  if (Op0.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  Op0 = Op0.getOperand(0);
+  MVT VT = Op0.getSimpleValueType();
+  if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
+      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
+    return SDValue();
+
+  X86::CondCode X86CC;
+  if (isNullConstant(Op1)) {
+    X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+  } else
+    return SDValue();
+
+  SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
+  return getSETCC(X86CC, KTEST, dl, DAG);
+}
+
 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
   MVT VT = Op.getSimpleValueType();
@@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
     return NewSetCC;
   }
 
+  // Try to lower using KTEST.
+  if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
+    return NewSetCC;
+
   // Look for X == 0, X == 1,  X != 0, or X != 1.  We can simplify some forms of
   // these.
   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
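A sketch (not part of the patch) of source that the relocated EmitKTEST now handles: with AVX512F plus AVX512DQ, the mask-equals-zero test below can select ktestw feeding sete, instead of moving the mask to a GPR and using TEST:

    #include <immintrin.h>

    // Assumes compilation with -mavx512f -mavx512dq; per the code above,
    // v16i1 masks require DQI for KTESTW.
    bool no_common_bits(__m512i a, __m512i b) {
      __mmask16 m = _mm512_test_epi32_mask(a, b); // v16i1 mask
      return m == 0; // bitcast-from-v16i1 compared to 0 -> KTEST + SETE
    }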
"__llvm_external_retpoline_push" - : "__llvm_retpoline_push"; case X86::EAX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_eax" - : "__llvm_retpoline_eax"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; case X86::ECX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_ecx" - : "__llvm_retpoline_ecx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; case X86::EDX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_edx" - : "__llvm_retpoline_edx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; case X86::R11: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_r11" - : "__llvm_retpoline_r11"; + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } @@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none - // are available, push the callee instead. This is less efficient, but is - // necessary for functions using 3 regparms. Such function calls are - // (currently) not eligible for tail call optimization, because there is no - // scratch register available to hold the address of the callee. + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector<unsigned, 3> AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else - AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { @@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, break; } } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); - if (AvailableReg == 0) { - // No register available. Use PUSH. This must not be a tailcall, and this - // must not be x64. 
@@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
 
 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
                                       unsigned Reg) {
+  if (Subtarget.useRetpolineExternalThunk()) {
+    // When using an external thunk for retpolines, we pick names that match the
+    // names GCC happens to use as well. This helps simplify the implementation
+    // of the thunks for kernels where they have no easy ability to create
+    // aliases and are doing non-trivial configuration of the thunk's body. For
+    // example, the Linux kernel will do boot-time hot patching of the thunk
+    // bodies and cannot easily export aliases of these to loaded modules.
+    //
+    // Note that at any point in the future, we may need to change the semantics
+    // of how we implement retpolines and at that time will likely change the
+    // name of the called thunk. Essentially, there is no hard guarantee that
+    // LLVM will generate calls to specific thunks, we merely make a best-effort
+    // attempt to help out kernels and other systems where duplicating the
+    // thunks is costly.
+    switch (Reg) {
+    case X86::EAX:
+      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+      return "__x86_indirect_thunk_eax";
+    case X86::ECX:
+      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+      return "__x86_indirect_thunk_ecx";
+    case X86::EDX:
+      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+      return "__x86_indirect_thunk_edx";
+    case X86::EDI:
+      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+      return "__x86_indirect_thunk_edi";
+    case X86::R11:
+      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+      return "__x86_indirect_thunk_r11";
+    }
+    llvm_unreachable("unexpected reg for retpoline");
+  }
+
+  // When targeting an internal COMDAT thunk use an LLVM-specific name.
   switch (Reg) {
-  case 0:
-    assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
-    return Subtarget.useRetpolineExternalThunk()
-               ? "__llvm_external_retpoline_push"
-               : "__llvm_retpoline_push";
   case X86::EAX:
-    return Subtarget.useRetpolineExternalThunk()
-               ? "__llvm_external_retpoline_eax"
-               : "__llvm_retpoline_eax";
+    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+    return "__llvm_retpoline_eax";
   case X86::ECX:
-    return Subtarget.useRetpolineExternalThunk()
-               ? "__llvm_external_retpoline_ecx"
-               : "__llvm_retpoline_ecx";
+    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+    return "__llvm_retpoline_ecx";
   case X86::EDX:
-    return Subtarget.useRetpolineExternalThunk()
-               ? "__llvm_external_retpoline_edx"
-               : "__llvm_retpoline_edx";
+    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+    return "__llvm_retpoline_edx";
+  case X86::EDI:
+    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+    return "__llvm_retpoline_edi";
   case X86::R11:
-    return Subtarget.useRetpolineExternalThunk()
-               ? "__llvm_external_retpoline_r11"
-               : "__llvm_retpoline_r11";
+    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+    return "__llvm_retpoline_r11";
   }
   llvm_unreachable("unexpected reg for retpoline");
 }
@@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
   // just use R11, but we scan for uses anyway to ensure we don't generate
   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
   // already a register use operand to the call to hold the callee. If none
-  // are available, push the callee instead. This is less efficient, but is
-  // necessary for functions using 3 regparms. Such function calls are
-  // (currently) not eligible for tail call optimization, because there is no
-  // scratch register available to hold the address of the callee.
+  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
+  // register and ESI is the base pointer to realigned stack frames with VLAs.
   SmallVector<unsigned, 3> AvailableRegs;
   if (Subtarget.is64Bit())
     AvailableRegs.push_back(X86::R11);
   else
-    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
+    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
 
   // Zero out any registers that are already used.
   for (const auto &MO : MI.operands()) {
@@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
       break;
     }
   }
+  if (!AvailableReg)
+    report_fatal_error("calling convention incompatible with retpoline, no "
+                       "available registers");
 
   const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
 
-  if (AvailableReg == 0) {
-    // No register available. Use PUSH. This must not be a tailcall, and this
-    // must not be x64.
-    if (Subtarget.is64Bit())
-      report_fatal_error(
-          "Cannot make an indirect call on x86-64 using both retpoline and a "
-          "calling convention that preservers r11");
-    if (Opc != X86::CALLpcrel32)
-      report_fatal_error("Cannot make an indirect tail call on x86 using "
-                         "retpoline without a preserved register");
-    BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
-    MI.getOperand(0).ChangeToES(Symbol);
-    MI.setDesc(TII->get(Opc));
-  } else {
-    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
-        .addReg(CalleeVReg);
-    MI.getOperand(0).ChangeToES(Symbol);
-    MI.setDesc(TII->get(Opc));
-    MachineInstrBuilder(*BB->getParent(), &MI)
-        .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
-  }
+  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+      .addReg(CalleeVReg);
+  MI.getOperand(0).ChangeToES(Symbol);
+  MI.setDesc(TII->get(Opc));
+  MachineInstrBuilder(*BB->getParent(), &MI)
+      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
 
   return BB;
 }
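A sketch (not part of the patch) of the 32-bit case that motivated the EDI fallback: with regparm(3), EAX, ECX, and EDX all carry arguments, so none of the previous scratch registers can hold the callee address. Such calls used to take the push-thunk path removed above; they now route through __llvm_retpoline_edi:

    // Hypothetical example, compiled with -m32 -mretpoline.
    typedef int (*regparm3_fn)(int, int, int) __attribute__((regparm(3)));

    int call_indirect(regparm3_fn fn) {
      return fn(1, 2, 3); // callee moves to EDI; call targets the EDI thunk
    }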
@@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
   SDValue N0 = BitCast.getOperand(0);
   EVT VecVT = N0->getValueType(0);
 
-  if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
-      N0->getOpcode() == ISD::OR) {
-    SDValue Op0 = N0->getOperand(0);
-    SDValue Op1 = N0->getOperand(1);
-    MVT TrunckVT;
-    MVT BitcastVT;
-    switch (VT.getSimpleVT().SimpleTy) {
-    default:
-      return SDValue();
-    case MVT::v16i1:
-      TrunckVT = MVT::i8;
-      BitcastVT = MVT::v8i1;
-      break;
-    case MVT::v32i1:
-      TrunckVT = MVT::i16;
-      BitcastVT = MVT::v16i1;
-      break;
-    case MVT::v64i1:
-      TrunckVT = MVT::i32;
-      BitcastVT = MVT::v32i1;
-      break;
-    }
-    bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
-    bool isArg0UndefLeft =
-        Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
-    bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
-    bool isArg1UndefLeft =
-        Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
-    SDValue OpLeft;
-    SDValue OpRight;
-    if (isArg0UndefRight && isArg1UndefLeft) {
-      OpLeft = Op0;
-      OpRight = Op1;
-    } else if (isArg1UndefRight && isArg0UndefLeft) {
-      OpLeft = Op1;
-      OpRight = Op0;
-    } else
-      return SDValue();
-    SDLoc DL(BitCast);
-    SDValue Shr = OpLeft->getOperand(0);
-    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
-    SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
-    SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
-    SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
-  }
-
   if (!VT.isScalarInteger() || !VecVT.isSimple())
     return SDValue();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0782d5598746..fae0889950b2 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   EXPAND_FROM_MEM,
-  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
   FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
   ROUNDP, ROUNDS
 };
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
   X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
   X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
   X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
                      X86ISD::FADD_RND),
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 223fa5771498..d03826bbe992 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
 static const char EAXThunkName[] = "__llvm_retpoline_eax";
 static const char ECXThunkName[] = "__llvm_retpoline_ecx";
 static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char PushThunkName[] = "__llvm_retpoline_push";
+static const char EDIThunkName[] = "__llvm_retpoline_edi";
 
 namespace {
 class X86RetpolineThunks : public MachineFunctionPass {
@@ -74,7 +74,6 @@ private:
   void createThunkFunction(Module &M, StringRef Name);
   void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
-  void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
   void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
 };
@@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
       createThunkFunction(M, R11ThunkName);
     else
       for (StringRef Name :
-           {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
+           {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
         createThunkFunction(M, Name);
     InsertedThunks = true;
     return true;
@@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
     populateThunk(MF, X86::R11);
   } else {
     // For 32-bit targets we need to emit a collection of thunks for various
-    // possible scratch registers as well as a fallback that is used when
-    // there are no scratch registers and assumes the retpoline target has
-    // been pushed.
+    // possible scratch registers as well as a fallback that uses EDI, which is
+    // normally callee saved.
     //
     //   __llvm_retpoline_eax:
    //         calll .Leax_call_target
     //   .Leax_capture_spec:
     //         pause
     //         lfence
     //         jmp .Leax_capture_spec
     //   .align 16
     //   .Leax_call_target:
     //         movl %eax, (%esp)
     //         retl
     //
@@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
     //         movl %edx, (%esp)
     //         retl
     //
-    // This last one is a bit more special and so needs a little extra
-    // handling.
-    //   __llvm_retpoline_push:
-    //         calll .Lpush_call_target
-    //   .Lpush_capture_spec:
-    //         pause
-    //         lfence
-    //         jmp .Lpush_capture_spec
-    //   .align 16
-    //   .Lpush_call_target:
-    //         # Clear pause_loop return address.
-    //         addl $4, %esp
-    //         # Top of stack words are: Callee, RA. Exchange Callee and RA.
-    //         pushl 4(%esp)  # Push callee
-    //         pushl 4(%esp)  # Push RA
-    //         popl 8(%esp)   # Pop RA to final RA
-    //         popl (%esp)    # Pop callee to next top of stack
-    //         retl           # Ret to callee
+    //   __llvm_retpoline_edi:
+    //   ...  # Same setup
+    //         movl %edi, (%esp)
+    //         retl
     if (MF.getName() == EAXThunkName)
       populateThunk(MF, X86::EAX);
     else if (MF.getName() == ECXThunkName)
       populateThunk(MF, X86::ECX);
     else if (MF.getName() == EDXThunkName)
       populateThunk(MF, X86::EDX);
-    else if (MF.getName() == PushThunkName)
-      populateThunk(MF);
+    else if (MF.getName() == EDIThunkName)
+      populateThunk(MF, X86::EDI);
     else
       llvm_unreachable("Invalid thunk name on x86-32!");
   }
@@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
       .addReg(Reg);
 }
 
-void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
-    MachineBasicBlock &MBB) {
-  // The instruction sequence we use to replace the return address without
-  // a scratch register is somewhat complicated:
-  //   # Clear capture_spec from return address.
-  //   addl $4, %esp
-  //   # Top of stack words are: Callee, RA. Exchange Callee and RA.
-  //   pushl 4(%esp)  # Push callee
-  //   pushl 4(%esp)  # Push RA
-  //   popl 8(%esp)   # Pop RA to final RA
-  //   popl (%esp)    # Pop callee to next top of stack
-  //   retl           # Ret to callee
-  BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
-      .addReg(X86::ESP)
-      .addImm(4);
-  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
-               false, 4);
-  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
-               false, 4);
-  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
-               false, 8);
-  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
-               false, 0);
-}
-
 void X86RetpolineThunks::populateThunk(MachineFunction &MF,
                                        Optional<unsigned> Reg) {
   // Set MF properties. We never use vregs...
@@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
   CaptureSpec->addSuccessor(CaptureSpec);
 
   CallTarget->setAlignment(4);
-  if (Reg) {
-    insertRegReturnAddrClobber(*CallTarget, *Reg);
-  } else {
-    assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
-    insert32BitPushReturnAddrClobber(*CallTarget);
-  }
+  insertRegReturnAddrClobber(*CallTarget, *Reg);
   BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
 }
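For context (not part of the diff, and the flag names here are an assumption from the LLVM 6.0 era): these code paths are exercised with clang's -mretpoline flag, while -mretpoline-external-thunk additionally directs calls to the GCC-compatible __x86_indirect_thunk_* symbols above instead of emitting the COMDAT __llvm_retpoline_* thunks.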