| author | Dimitry Andric <dim@FreeBSD.org> | 2018-02-02 17:07:53 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2018-02-02 17:07:53 +0000 |
| commit | 6d18171c1901a4db5d3e757a5ba4737fe8789dec (patch) | |
| tree | 6adfbc90504e1005368a826374523b46773e1599 /lib/Target | |
| parent | 4a6a1ccbecd7e34f40b05b4ba0a05d0031dd1eff (diff) | |
Diffstat (limited to 'lib/Target')
23 files changed, 634 insertions, 39 deletions
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 61967605432e..2c127d787260 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3756,36 +3756,45 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // FIXME: This isn't safe because the addressing mode doesn't work // correctly if vaddr is negative. // - // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate - // being in src0. - // // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. // // See if we can extract an immediate offset by recognizing one of these: // V_ADD_I32_e32 dst, imm, src1 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 // V_ADD will be removed by "Remove dead machine instructions". - if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) { - const MachineOperand *Src = - getNamedOperand(*Add, AMDGPU::OpName::src0); + if (Add && + (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || + Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { + static const unsigned SrcNames[2] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + }; - if (Src->isReg()) { - auto Mov = MRI.getUniqueVRegDef(Src->getReg()); - if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) - Src = &Mov->getOperand(1); - } + // Find a literal offset in one of source operands. + for (int i = 0; i < 2; i++) { + const MachineOperand *Src = + getNamedOperand(*Add, SrcNames[i]); - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } + if (Src->isReg()) { + auto Mov = MRI.getUniqueVRegDef(Src->getReg()); + if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) + Src = &Mov->getOperand(1); + } + + if (Src) { + if (Src->isImm()) + Offset = Src->getImm(); + else if (Src->isCImm()) + Offset = Src->getCImm()->getZExtValue(); + } + + if (Offset && isLegalMUBUFImmOffset(Offset)) { + VAddr = getNamedOperand(*Add, SrcNames[!i]); + break; + } - if (Offset && isLegalMUBUFImmOffset(Offset)) - VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1); - else Offset = 0; + } } BuildMI(*MBB, Inst, Inst.getDebugLoc(), diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 49645834e2de..05c98aab6f27 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -141,3 +141,16 @@ void Thumb1InstrInfo::expandLoadStackGuard( else expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi); } + +bool Thumb1InstrInfo::canCopyGluedNodeDuringSchedule(SDNode *N) const { + // In Thumb1 the scheduler may need to schedule a cross-copy between GPRS and CPSR + // but this is not always possible there, so allow the Scheduler to clone tADCS and tSBCS + // even if they have glue. + // FIXME. Actually implement the cross-copy where it is possible (post v6) + // because these copies entail more spilling. 
+ unsigned Opcode = N->getMachineOpcode(); + if (Opcode == ARM::tADCS || Opcode == ARM::tSBCS) + return true; + + return false; +} diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index e8d9a9c4ff14..9f04a3ed262f 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -53,6 +53,7 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 79ca9cc6b800..ba05b0f48df7 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -3507,10 +3507,9 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { - if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) { - if (Type == MVT::i32) + if ((ABI.IsN32() || ABI.IsN64()) && Type == MVT::i32) return true; - } + return IsSigned; } diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index 9db6b7b1bcd6..f767c8321988 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -136,6 +136,13 @@ IsGlobalInSmallSectionImpl(const GlobalObject *GO, return false; Type *Ty = GVA->getValueType(); + + // It is possible that the type of the global is unsized, i.e. a declaration + // of a extern struct. In this case don't presume it is in the small data + // section. This happens e.g. when building the FreeBSD kernel. + if (!Ty->isSized()) + return false; + return IsInSmallSection( GVA->getParent()->getDataLayout().getTypeAllocSize(Ty)); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 9864aa372354..9f6c7d65592d 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -88,10 +88,11 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineFrameInfo &MFI = MF.getFrameInfo(); + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); const SparcInstrInfo &TII = - *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); + *static_cast<const SparcInstrInfo *>(Subtarget.getInstrInfo()); const SparcRegisterInfo &RegInfo = - *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + *static_cast<const SparcRegisterInfo *>(Subtarget.getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. @@ -141,7 +142,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, // Adds the SPARC subtarget-specific spill area to the stack // size. Also ensures target-required alignment. - NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); + NumBytes = Subtarget.getAdjustedFrameSize(NumBytes); // Finally, ensure that the size is sufficiently aligned for the // data on the stack. 
@@ -176,9 +177,27 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex); if (NeedsStackRealignment) { - // andn %o6, MaxAlign-1, %o6 + int64_t Bias = Subtarget.getStackPointerBias(); + unsigned regUnbiased; + if (Bias) { + // This clobbers G1 which we always know is available here. + regUnbiased = SP::G1; + // add %o6, BIAS, %g1 + BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), regUnbiased) + .addReg(SP::O6).addImm(Bias); + } else + regUnbiased = SP::O6; + + // andn %regUnbiased, MaxAlign-1, %regUnbiased int MaxAlign = MFI.getMaxAlignment(); - BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1); + BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), regUnbiased) + .addReg(regUnbiased).addImm(MaxAlign - 1); + + if (Bias) { + // add %g1, -BIAS, %o6 + BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6) + .addReg(regUnbiased).addImm(-Bias); + } } } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 7e0df2941467..23ac9d9936ad 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -48,6 +48,7 @@ set(sources X86PadShortFunction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp + X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp X86Subtarget.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 5631648d2dc8..361326824292 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -22,6 +22,7 @@ namespace llvm { class FunctionPass; class ImmutablePass; class InstructionSelector; +class ModulePass; class PassRegistry; class X86RegisterBankInfo; class X86Subtarget; @@ -102,6 +103,9 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); +/// This pass creates the thunks for the retpoline feature. +FunctionPass *createX86RetpolineThunksPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index ba998467b799..ba97982e3330 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -329,6 +329,27 @@ def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast.">; +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). +def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. 
+def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 7e70789ac82c..31328e6aea95 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -32,6 +32,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool EmitFPOData = false; + bool NeedsRetpoline = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5dae485f4c9f..80ce3c579fe0 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3172,6 +3172,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) return false; + // Functions using retpoline should use SDISel for calls. + if (Subtarget->useRetpoline()) + return false; + // Handle only C, fastcc, and webkit_js calling conventions for now. switch (CC) { default: return false; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 80b1cc192a88..11808f8995fe 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -741,6 +741,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + // FIXME: Add retpoline support and remove this. + if (Is64Bit && IsLargeCodeModel && STI.useRetpoline()) + report_fatal_error("Emitting stack probe calls on 64-bit with the large " + "code model and retpoline not yet implemented."); + unsigned CallOp; if (Is64Bit) CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; @@ -2345,6 +2350,10 @@ void X86FrameLowering::adjustForSegmentedStacks( // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. + // FIXME: Add retpoline support and remove the error here.. + if (STI.useRetpoline()) + report_fatal_error("Emitting morestack calls on 64-bit with the large " + "code model and retpoline not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 660c1eff3c4b..d79fd0ca4daa 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -629,11 +629,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - // Only does this when target favors doesn't favor register indirect - // call. + // Only do this when the target can fold the load into the call or + // jmp. + !Subtarget->useRetpoline() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be folded into TC_RETURN. 
(Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3a163637da26..38885c42b529 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25767,6 +25767,15 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return isShuffleMaskLegal(Mask, VT); } +bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { + // If the subtarget is using retpolines, we need to not generate jump tables. + if (Subtarget.useRetpoline()) + return false; + + // Otherwise, fallback on the generic logic. + return TargetLowering::areJTsAllowed(Fn); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -27069,6 +27078,115 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, return BB; } +static unsigned getOpcodeForRetpoline(unsigned RPOpc) { + switch (RPOpc) { + case X86::RETPOLINE_CALL32: + return X86::CALLpcrel32; + case X86::RETPOLINE_CALL64: + return X86::CALL64pcrel32; + case X86::RETPOLINE_TCRETURN32: + return X86::TCRETURNdi; + case X86::RETPOLINE_TCRETURN64: + return X86::TCRETURNdi64; + } + llvm_unreachable("not retpoline opcode"); +} + +static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { + switch (Reg) { + case 0: + assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_push" + : "__llvm_retpoline_push"; + case X86::EAX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_eax" + : "__llvm_retpoline_eax"; + case X86::ECX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_ecx" + : "__llvm_retpoline_ecx"; + case X86::EDX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_edx" + : "__llvm_retpoline_edx"; + case X86::R11: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_r11" + : "__llvm_retpoline_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const { + // Copy the virtual register into the R11 physical register and + // call the retpoline thunk. + DebugLoc DL = MI.getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + unsigned CalleeVReg = MI.getOperand(0).getReg(); + unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + + // Find an available scratch register to hold the callee. On 64-bit, we can + // just use R11, but we scan for uses anyway to ensure we don't generate + // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't + // already a register use operand to the call to hold the callee. If none + // are available, push the callee instead. This is less efficient, but is + // necessary for functions using 3 regparms. Such function calls are + // (currently) not eligible for tail call optimization, because there is no + // scratch register available to hold the address of the callee. + SmallVector<unsigned, 3> AvailableRegs; + if (Subtarget.is64Bit()) + AvailableRegs.push_back(X86::R11); + else + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + + // Zero out any registers that are already used. 
+ for (const auto &MO : MI.operands()) { + if (MO.isReg() && MO.isUse()) + for (unsigned &Reg : AvailableRegs) + if (Reg == MO.getReg()) + Reg = 0; + } + + // Choose the first remaining non-zero available register. + unsigned AvailableReg = 0; + for (unsigned MaybeReg : AvailableRegs) { + if (MaybeReg) { + AvailableReg = MaybeReg; + break; + } + } + + const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + + if (AvailableReg == 0) { + // No register available. Use PUSH. This must not be a tailcall, and this + // must not be x64. + if (Subtarget.is64Bit()) + report_fatal_error( + "Cannot make an indirect call on x86-64 using both retpoline and a " + "calling convention that preservers r11"); + if (Opc != X86::CALLpcrel32) + report_fatal_error("Cannot make an indirect tail call on x86 using " + "retpoline without a preserved register"); + BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + } else { + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); + } + return BB; +} + MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -27584,6 +27702,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); + case X86::RETPOLINE_CALL32: + case X86::RETPOLINE_CALL64: + case X86::RETPOLINE_TCRETURN32: + case X86::RETPOLINE_TCRETURN64: + return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 1fb7c7ed4e98..3aa9d01bff20 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -982,6 +982,9 @@ namespace llvm { bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; + /// Returns true if lowering to a jump table is allowed. + bool areJTsAllowed(const Function *Fn) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. @@ -1294,6 +1297,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 06600a4ef286..d66d9258e96f 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1146,14 +1146,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[Not64BitMode, NotUseRetpoline]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. 
def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[Not64BitMode, IsNotPIC]>; + Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1165,13 +1165,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseRetpoline]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 5581fd462a1d..7932686ebc87 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -211,11 +211,12 @@ let isCall = 1 in Sched<[WriteJumpLd]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>, + Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, OpSize32, - Requires<[Not64BitMode,FavorMemIndirectCall]>, + Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>, Sched<[WriteJumpLd]>; let Predicates = [Not64BitMode] in { @@ -298,11 +299,12 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode,NotUseRetpoline]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode,FavorMemIndirectCall]>; + Requires<[In64BitMode,FavorMemIndirectCall, + NotUseRetpoline]>; def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; @@ -341,6 +343,27 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, } } +let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, + Uses = [RSP, SSP], + usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def RETPOLINE_CALL32 : + PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, + Requires<[Not64BitMode,UseRetpoline]>; + + def RETPOLINE_CALL64 : + PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, + Requires<[In64BitMode,UseRetpoline]>; + + // Retpoline variant of indirect tail calls. + let isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def RETPOLINE_TCRETURN64 : + PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; + def RETPOLINE_TCRETURN32 : + PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; + } +} + // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 27c67500b26f..a657b19c08c9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -938,6 +938,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 8a7179e48a0b..730ba745eb70 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -874,6 +874,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, // address is to far away. (TODO: support non-relative addressing) break; case MachineOperand::MO_Register: + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error("Lowering register statepoints with retpoline not " + "yet implemented."); CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); CallOpcode = X86::CALL64r; break; @@ -1028,6 +1032,10 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, EmitAndCountInstruction( MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error( + "Lowering patchpoint with retpoline not yet implemented."); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp new file mode 100644 index 000000000000..223fa5771498 --- /dev/null +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -0,0 +1,311 @@ +//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk implementing a "retpoline". This is +/// a RET-implemented trampoline that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +static const char ThunkNamePrefix[] = "__llvm_retpoline_"; +static const char R11ThunkName[] = "__llvm_retpoline_r11"; +static const char EAXThunkName[] = "__llvm_retpoline_eax"; +static const char ECXThunkName[] = "__llvm_retpoline_ecx"; +static const char EDXThunkName[] = "__llvm_retpoline_edx"; +static const char PushThunkName[] = "__llvm_retpoline_push"; + +namespace { +class X86RetpolineThunks : public MachineFunctionPass { +public: + static char ID; + + X86RetpolineThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "X86 Retpoline Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineModuleInfo>(); + AU.addPreserved<MachineModuleInfo>(); + } + +private: + MachineModuleInfo *MMI; + const TargetMachine *TM; + bool Is64Bit; + const X86Subtarget *STI; + const X86InstrInfo *TII; + + bool InsertedThunks; + + void createThunkFunction(Module &M, StringRef Name); + void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); + void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); + void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None); +}; + +} // end anonymous namespace + +FunctionPass *llvm::createX86RetpolineThunksPass() { + return new X86RetpolineThunks(); +} + +char X86RetpolineThunks::ID = 0; + +bool X86RetpolineThunks::doInitialization(Module &M) { + InsertedThunks = false; + return false; +} + +bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << getPassName() << '\n'); + + TM = &MF.getTarget();; + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; + + MMI = &getAnalysis<MachineModuleInfo>(); + Module &M = const_cast<Module &>(*MMI->getModule()); + + // If this function is not a thunk, check to see if we need to insert + // a thunk. + if (!MF.getName().startswith(ThunkNamePrefix)) { + // If we've already inserted a thunk, nothing else to do. + if (InsertedThunks) + return false; + + // Only add a thunk if one of the functions has the retpoline feature + // enabled in its subtarget, and doesn't enable external thunks. + // FIXME: Conditionalize on indirect calls so we don't emit a thunk when + // nothing will end up calling it. + // FIXME: It's a little silly to look at every function just to enumerate + // the subtargets, but eventually we'll want to look at them for indirect + // calls, so maybe this is OK. + if (!STI->useRetpoline() || STI->useRetpolineExternalThunk()) + return false; + + // Otherwise, we need to insert the thunk. + // WARNING: This is not really a well behaving thing to do in a function + // pass. 
We extract the module and insert a new function (and machine + // function) directly into the module. + if (Is64Bit) + createThunkFunction(M, R11ThunkName); + else + for (StringRef Name : + {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + createThunkFunction(M, Name); + InsertedThunks = true; + return true; + } + + // If this *is* a thunk function, we need to populate it with the correct MI. + if (Is64Bit) { + assert(MF.getName() == "__llvm_retpoline_r11" && + "Should only have an r11 thunk on 64-bit targets"); + + // __llvm_retpoline_r11: + // callq .Lr11_call_target + // .Lr11_capture_spec: + // pause + // lfence + // jmp .Lr11_capture_spec + // .align 16 + // .Lr11_call_target: + // movq %r11, (%rsp) + // retq + populateThunk(MF, X86::R11); + } else { + // For 32-bit targets we need to emit a collection of thunks for various + // possible scratch registers as well as a fallback that is used when + // there are no scratch registers and assumes the retpoline target has + // been pushed. + // __llvm_retpoline_eax: + // calll .Leax_call_target + // .Leax_capture_spec: + // pause + // jmp .Leax_capture_spec + // .align 16 + // .Leax_call_target: + // movl %eax, (%esp) # Clobber return addr + // retl + // + // __llvm_retpoline_ecx: + // ... # Same setup + // movl %ecx, (%esp) + // retl + // + // __llvm_retpoline_edx: + // ... # Same setup + // movl %edx, (%esp) + // retl + // + // This last one is a bit more special and so needs a little extra + // handling. + // __llvm_retpoline_push: + // calll .Lpush_call_target + // .Lpush_capture_spec: + // pause + // lfence + // jmp .Lpush_capture_spec + // .align 16 + // .Lpush_call_target: + // # Clear pause_loop return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + if (MF.getName() == EAXThunkName) + populateThunk(MF, X86::EAX); + else if (MF.getName() == ECXThunkName) + populateThunk(MF, X86::ECX); + else if (MF.getName() == EDXThunkName) + populateThunk(MF, X86::EDX); + else if (MF.getName() == PushThunkName) + populateThunk(MF); + else + llvm_unreachable("Invalid thunk name on x86-32!"); + } + + return true; +} + +void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { + assert(Name.startswith(ThunkNamePrefix) && + "Created a thunk with an unexpected prefix!"); + + LLVMContext &Ctx = M.getContext(); + auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = + Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); + F->setVisibility(GlobalValue::HiddenVisibility); + F->setComdat(M.getOrInsertComdat(Name)); + + // Add Attributes so that we don't create a frame, unwind information, or + // inline. + AttrBuilder B; + B.addAttribute(llvm::Attribute::NoUnwind); + B.addAttribute(llvm::Attribute::Naked); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + + // Populate our function a bit so that we can verify. + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); + IRBuilder<> Builder(Entry); + + Builder.CreateRetVoid(); +} + +void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, + unsigned Reg) { + const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; + const unsigned SPReg = Is64Bit ? 
X86::RSP : X86::ESP; + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) + .addReg(Reg); +} + +void X86RetpolineThunks::insert32BitPushReturnAddrClobber( + MachineBasicBlock &MBB) { + // The instruction sequence we use to replace the return address without + // a scratch register is somewhat complicated: + // # Clear capture_spec from return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 8); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 0); +} + +void X86RetpolineThunks::populateThunk(MachineFunction &MF, + Optional<unsigned> Reg) { + // Set MF properties. We never use vregs... + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + + MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MF.push_back(CaptureSpec); + MF.push_back(CallTarget); + + const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; + + BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget); + Entry->addSuccessor(CallTarget); + Entry->addSuccessor(CaptureSpec); + CallTarget->setHasAddressTaken(); + + // In the capture loop for speculation, we want to stop the processor from + // speculating as fast as possible. On Intel processors, the PAUSE instruction + // will block speculation without consuming any execution resources. On AMD + // processors, the PAUSE instruction is (essentially) a nop, so we also use an + // LFENCE instruction which they have advised will stop speculation as well + // with minimal resource utilization. We still end the capture with a jump to + // form an infinite loop to fully guarantee that no matter what implementation + // of the x86 ISA, speculating this code path never escapes. 
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); + CaptureSpec->setHasAddressTaken(); + CaptureSpec->addSuccessor(CaptureSpec); + + CallTarget->setAlignment(4); + if (Reg) { + insertRegReturnAddrClobber(*CallTarget, *Reg); + } else { + assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); + insert32BitPushReturnAddrClobber(*CallTarget); + } + BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index ad023623142f..dca98d999e58 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -314,6 +314,8 @@ void X86Subtarget::initializeEnvironment() { HasSGX = false; HasCLFLUSHOPT = false; HasCLWB = false; + UseRetpoline = false; + UseRetpolineExternalThunk = false; IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index c9435890fc1f..37ffac1faf68 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -341,6 +341,14 @@ protected: /// Processor supports Cache Line Write Back instruction bool HasCLWB; + /// Use a retpoline thunk rather than indirect calls to block speculative + /// execution. + bool UseRetpoline; + + /// When using a retpoline thunk, call an externally provided thunk rather + /// than emitting one inside the compiler. + bool UseRetpolineExternalThunk; + /// Use software floating point for code generation. bool UseSoftFloat; @@ -574,6 +582,8 @@ public: bool hasIBT() const { return HasIBT; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } + bool useRetpoline() const { return UseRetpoline; } + bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } bool isXRaySupported() const override { return is64Bit(); } @@ -696,6 +706,10 @@ public: /// Return true if the subtarget allows calls to immediate address. bool isLegalToCallImmediateAddr() const; + /// If we are using retpolines, we need to expand indirectbr to avoid it + /// lowering to an actual indirect jump. + bool enableIndirectBrExpand() const override { return useRetpoline(); } + /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index e95e6ecae091..ac242e1c00e0 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -321,6 +321,7 @@ public: void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; + void addPreEmitPass2() override; void addPreSched2() override; }; @@ -350,6 +351,11 @@ void X86PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); + + // Add passes that handle indirect branch removal and insertion of a retpoline + // thunk. These will be a no-op unless a function subtarget has the retpoline + // feature enabled. + addPass(createIndirectBrExpandPass()); } bool X86PassConfig::addInstSelector() { @@ -436,3 +442,7 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86EvexToVexInsts()); } } + +void X86PassConfig::addPreEmitPass2() { + addPass(createX86RetpolineThunksPass()); +} |
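For reference, the 64-bit thunk body that `populateThunk` emits (and that an environment must supply itself, under the name `__llvm_external_retpoline_r11`, when the `retpoline-external-thunk` feature is set) corresponds to the sequence spelled out in the comments above. The sketch below only transcribes those comments into GNU assembler syntax; the `.text`/`.globl` boilerplate is added here for completeness, and an externally provided thunk is free to substitute any equivalent (for example hot-patchable) body.

```asm
# Sketch of an externally provided 64-bit retpoline thunk, mirroring the
# sequence the in-compiler pass generates; the call target is passed in %r11.
        .text
        .globl  __llvm_external_retpoline_r11
__llvm_external_retpoline_r11:
        callq   .Lr11_call_target
.Lr11_capture_spec:
        pause                           # blocks speculation on Intel
        lfence                          # blocks speculation on AMD as well
        jmp     .Lr11_capture_spec      # trap any speculative execution here
        .align  16
.Lr11_call_target:
        movq    %r11, (%rsp)            # overwrite the captured return address
        retq                            # "return" to the real call target
```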

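The 32-bit fallback thunk used when no scratch register is free follows the stack-shuffling sequence documented in `insert32BitPushReturnAddrClobber`. On entry the return address is on top of the stack with the callee address (pushed by `EmitLoweredRetpoline`) directly beneath it; the thunk swaps the two so the final `retl` transfers to the callee and leaves the original return address on top for it. A sketch under the same assumptions as above (GNU assembler syntax, directives added for completeness):

```asm
# Sketch of the __llvm_retpoline_push fallback thunk: exchanges the pushed
# callee address and the return address, then "returns" into the callee.
        .text
        .globl  __llvm_retpoline_push
__llvm_retpoline_push:
        calll   .Lpush_call_target
.Lpush_capture_spec:
        pause
        lfence
        jmp     .Lpush_capture_spec
        .align  16
.Lpush_call_target:
        addl    $4, %esp                # drop the capture-loop return address
        pushl   4(%esp)                 # push callee
        pushl   4(%esp)                 # push RA
        popl    8(%esp)                 # pop RA to its final slot
        popl    (%esp)                  # pop callee to the new top of stack
        retl                            # "return" into the callee
```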