diff options
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- | lib/Target/X86/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Target/X86/Disassembler/X86Disassembler.cpp | 7 | ||||
-rw-r--r-- | lib/Target/X86/X86.h | 3 | ||||
-rw-r--r-- | lib/Target/X86/X86DomainReassignment.cpp | 65 | ||||
-rw-r--r-- | lib/Target/X86/X86FastISel.cpp | 21 | ||||
-rw-r--r-- | lib/Target/X86/X86FlagsCopyLowering.cpp | 935 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 26 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 3 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrArithmetic.td | 34 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrCompiler.td | 8 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 121 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.h | 6 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.td | 75 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSystem.td | 13 | ||||
-rw-r--r-- | lib/Target/X86/X86RegisterInfo.td | 16 | ||||
-rw-r--r-- | lib/Target/X86/X86Schedule.td | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86ScheduleAtom.td | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86TargetMachine.cpp | 3 |
18 files changed, 1107 insertions, 238 deletions
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 23ac9d9936ad8..44400813094b7 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -31,6 +31,7 @@ set(sources X86FixupBWInsts.cpp X86FixupLEAs.cpp X86FixupSetCC.cpp + X86FlagsCopyLowering.cpp X86FloatingPoint.cpp X86FrameLowering.cpp X86InstructionSelector.cpp diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index c58254ae38c19..b3c491b3de5e3 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -265,13 +265,10 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( /// @param reg - The Reg to append. static void translateRegister(MCInst &mcInst, Reg reg) { #define ENTRY(x) X86::x, - uint8_t llvmRegnums[] = { - ALL_REGS - 0 - }; + static constexpr MCPhysReg llvmRegnums[] = {ALL_REGS}; #undef ENTRY - uint8_t llvmRegnum = llvmRegnums[reg]; + MCPhysReg llvmRegnum = llvmRegnums[reg]; mcInst.addOperand(MCOperand::createReg(llvmRegnum)); } diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 3613268242922..642dda8f42259 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -66,6 +66,9 @@ FunctionPass *createX86OptimizeLEAs(); /// Return a pass that transforms setcc + movzx pairs into xor + setcc. FunctionPass *createX86FixupSetCC(); +/// Return a pass that lowers EFLAGS copy pseudo instructions. +FunctionPass *createX86FlagsCopyLoweringPass(); + /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index bc0f55f581ff4..ffe176ad47701 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Printable.h" #include <bitset> using namespace llvm; @@ -262,25 +263,6 @@ public: } }; -/// An Instruction Converter which completely deletes an instruction. -/// For example, IMPLICIT_DEF instructions can be deleted when converting from -/// GPR to mask. -class InstrDeleter : public InstrConverterBase { -public: - InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {} - - bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII, - MachineRegisterInfo *MRI) const override { - assert(isLegal(MI, TII) && "Cannot convert instruction"); - return true; - } - - double getExtraCost(const MachineInstr *MI, - MachineRegisterInfo *MRI) const override { - return 0; - } -}; - // Key type to be used by the Instruction Converters map. // A converter is identified by <destination domain, source opcode> typedef std::pair<int, unsigned> InstrConverterBaseKeyTy; @@ -310,8 +292,12 @@ private: /// Domains which this closure can legally be reassigned to. std::bitset<NumDomains> LegalDstDomains; + /// An ID to uniquely identify this closure, even when it gets + /// moved around + unsigned ID; + public: - Closure(std::initializer_list<RegDomain> LegalDstDomainList) { + Closure(unsigned ID, std::initializer_list<RegDomain> LegalDstDomainList) : ID(ID) { for (RegDomain D : LegalDstDomainList) LegalDstDomains.set(D); } @@ -347,6 +333,27 @@ public: return Instrs; } + LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const { + dbgs() << "Registers: "; + bool First = true; + for (unsigned Reg : Edges) { + if (!First) + dbgs() << ", "; + First = false; + dbgs() << printReg(Reg, MRI->getTargetRegisterInfo()); + } + dbgs() << "\n" << "Instructions:"; + for (MachineInstr *MI : Instrs) { + dbgs() << "\n "; + MI->print(dbgs()); + } + dbgs() << "\n"; + } + + unsigned getID() const { + return ID; + } + }; class X86DomainReassignment : public MachineFunctionPass { @@ -358,7 +365,7 @@ class X86DomainReassignment : public MachineFunctionPass { DenseSet<unsigned> EnclosedEdges; /// All instructions that are included in some closure. - DenseMap<MachineInstr *, Closure *> EnclosedInstrs; + DenseMap<MachineInstr *, unsigned> EnclosedInstrs; public: static char ID; @@ -435,14 +442,14 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { auto I = EnclosedInstrs.find(MI); if (I != EnclosedInstrs.end()) { - if (I->second != &C) + if (I->second != C.getID()) // Instruction already belongs to another closure, avoid conflicts between // closure and mark this closure as illegal. C.setAllIllegal(); return; } - EnclosedInstrs[MI] = &C; + EnclosedInstrs[MI] = C.getID(); C.addInstruction(MI); // Mark closure as illegal for reassignment to domains, if there is no @@ -587,7 +594,7 @@ void X86DomainReassignment::initConverters() { new InstrIgnore(TargetOpcode::PHI); Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = - new InstrDeleter(TargetOpcode::IMPLICIT_DEF); + new InstrIgnore(TargetOpcode::IMPLICIT_DEF); Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] = new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2); @@ -723,6 +730,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { std::vector<Closure> Closures; // Go over all virtual registers and calculate a closure. + unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); @@ -735,7 +743,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Calculate closure starting with Reg. - Closure C({MaskDomain}); + Closure C(ClosureID++, {MaskDomain}); buildClosure(C, Reg); // Collect all closures that can potentially be converted. @@ -743,15 +751,16 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { Closures.push_back(std::move(C)); } - for (Closure &C : Closures) + for (Closure &C : Closures) { + DEBUG(C.dump(MRI)); if (isReassignmentProfitable(C, MaskDomain)) { reassign(C, MaskDomain); ++NumClosuresConverted; Changed = true; } + } - for (auto I : Converters) - delete I.second; + DeleteContainerSeconds(Converters); DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n"); DEBUG(MF.print(dbgs())); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 80ce3c579fe0b..dca6c592614c9 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1789,9 +1789,16 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = nullptr; - assert(!I->getType()->isIntegerTy(8) && - "i8 shifts should be handled by autogenerated table"); - if (I->getType()->isIntegerTy(16)) { + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { @@ -1836,10 +1843,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { // The shift instruction uses X86::CL. If we defined a super-register // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. - assert(CReg != X86::CL && "CReg should be a super register of CL"); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::KILL), X86::CL) - .addReg(CReg, RegState::Kill); + if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp new file mode 100644 index 0000000000000..a6fccd1347407 --- /dev/null +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -0,0 +1,935 @@ +//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual +/// flag bits. +/// +/// We have to do this by carefully analyzing and rewriting the usage of the +/// copied EFLAGS register because there is no general way to rematerialize the +/// entire EFLAGS register safely and efficiently. Using `popf` both forces +/// dynamic stack adjustment and can create correctness issues due to IF, TF, +/// and other non-status flags being overwritten. Using sequences involving +/// SAHF don't work on all x86 processors and are often quite slow compared to +/// directly testing a single status preserved in its own GPR. +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <utility> + +using namespace llvm; + +#define PASS_KEY "x86-flags-copy-lowering" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); +STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); +STATISTIC(NumTestsInserted, "Number of test instructions inserted"); +STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); + +namespace llvm { + +void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); + +} // end namespace llvm + +namespace { + +// Convenient array type for storing registers associated with each condition. +using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; + +class X86FlagsCopyLoweringPass : public MachineFunctionPass { +public: + X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { + initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pass identification, replacement for typeid. + static char ID; + +private: + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + const TargetRegisterClass *PromoteRC; + MachineDominatorTree *MDT; + + CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, + MachineInstr &CopyDefI); + + unsigned promoteCondToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond); + std::pair<unsigned, bool> + getCondOrInverseInReg(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + X86::CondCode Cond, CondRegArray &CondRegs); + void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, + DebugLoc Loc, unsigned Reg); + + void rewriteArithmetic(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &MI, MachineOperand &FlagUse, + CondRegArray &CondRegs); + void rewriteCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &CMovI, MachineOperand &FlagUse, + CondRegArray &CondRegs); + void rewriteCondJmp(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &JmpI, CondRegArray &CondRegs); + void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, + MachineInstr &CopyDefI); + void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &SetBI, + MachineOperand &FlagUse, CondRegArray &CondRegs); + void rewriteSetCC(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &SetCCI, MachineOperand &FlagUse, + CondRegArray &CondRegs); +}; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, + "X86 EFLAGS copy lowering", false, false) +INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, + "X86 EFLAGS copy lowering", false, false) + +FunctionPass *llvm::createX86FlagsCopyLoweringPass() { + return new X86FlagsCopyLoweringPass(); +} + +char X86FlagsCopyLoweringPass::ID = 0; + +void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +namespace { +/// An enumeration of the arithmetic instruction mnemonics which have +/// interesting flag semantics. +/// +/// We can map instruction opcodes into these mnemonics to make it easy to +/// dispatch with specific functionality. +enum class FlagArithMnemonic { + ADC, + ADCX, + ADOX, + RCL, + RCR, + SBB, +}; +} // namespace + +static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { + switch (Opcode) { + default: + report_fatal_error("No support for lowering a copy into EFLAGS when used " + "by this instruction!"); + +#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ + case X86::MNEMONIC##8##SUFFIX: \ + case X86::MNEMONIC##16##SUFFIX: \ + case X86::MNEMONIC##32##SUFFIX: \ + case X86::MNEMONIC##64##SUFFIX: + +#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ + LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ + case X86::MNEMONIC##8ri: \ + case X86::MNEMONIC##16ri8: \ + case X86::MNEMONIC##32ri8: \ + case X86::MNEMONIC##64ri8: \ + case X86::MNEMONIC##16ri: \ + case X86::MNEMONIC##32ri: \ + case X86::MNEMONIC##64ri32: \ + case X86::MNEMONIC##8mi: \ + case X86::MNEMONIC##16mi8: \ + case X86::MNEMONIC##32mi8: \ + case X86::MNEMONIC##64mi8: \ + case X86::MNEMONIC##16mi: \ + case X86::MNEMONIC##32mi: \ + case X86::MNEMONIC##64mi32: \ + case X86::MNEMONIC##8i8: \ + case X86::MNEMONIC##16i16: \ + case X86::MNEMONIC##32i32: \ + case X86::MNEMONIC##64i32: + + LLVM_EXPAND_ADC_SBB_INSTR(ADC) + return FlagArithMnemonic::ADC; + + LLVM_EXPAND_ADC_SBB_INSTR(SBB) + return FlagArithMnemonic::SBB; + +#undef LLVM_EXPAND_ADC_SBB_INSTR + + LLVM_EXPAND_INSTR_SIZES(RCL, rCL) + LLVM_EXPAND_INSTR_SIZES(RCL, r1) + LLVM_EXPAND_INSTR_SIZES(RCL, ri) + return FlagArithMnemonic::RCL; + + LLVM_EXPAND_INSTR_SIZES(RCR, rCL) + LLVM_EXPAND_INSTR_SIZES(RCR, r1) + LLVM_EXPAND_INSTR_SIZES(RCR, ri) + return FlagArithMnemonic::RCR; + +#undef LLVM_EXPAND_INSTR_SIZES + + case X86::ADCX32rr: + case X86::ADCX64rr: + case X86::ADCX32rm: + case X86::ADCX64rm: + return FlagArithMnemonic::ADCX; + + case X86::ADOX32rr: + case X86::ADOX64rr: + case X86::ADOX32rm: + case X86::ADOX64rm: + return FlagArithMnemonic::ADOX; + } +} + +static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, + MachineInstr &SplitI, + const X86InstrInfo &TII) { + MachineFunction &MF = *MBB.getParent(); + + assert(SplitI.getParent() == &MBB && + "Split instruction must be in the split block!"); + assert(SplitI.isBranch() && + "Only designed to split a tail of branch instructions!"); + assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && + "Must split on an actual jCC instruction!"); + + // Dig out the previous instruction to the split point. + MachineInstr &PrevI = *std::prev(SplitI.getIterator()); + assert(PrevI.isBranch() && "Must split after a branch!"); + assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && + "Must split after an actual jCC instruction!"); + assert(!std::prev(PrevI.getIterator())->isTerminator() && + "Must only have this one terminator prior to the split!"); + + // Grab the one successor edge that will stay in `MBB`. + MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); + + // Analyze the original block to see if we are actually splitting an edge + // into two edges. This can happen when we have multiple conditional jumps to + // the same successor. + bool IsEdgeSplit = + std::any_of(SplitI.getIterator(), MBB.instr_end(), + [&](MachineInstr &MI) { + assert(MI.isTerminator() && + "Should only have spliced terminators!"); + return llvm::any_of( + MI.operands(), [&](MachineOperand &MOp) { + return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; + }); + }) || + MBB.getFallThrough() == &UnsplitSucc; + + MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); + + // Insert the new block immediately after the current one. Any existing + // fallthrough will be sunk into this new block anyways. + MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); + + // Splice the tail of instructions into the new block. + NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); + + // Copy the necessary succesors (and their probability info) into the new + // block. + for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) + if (IsEdgeSplit || *SI != &UnsplitSucc) + NewMBB.copySuccessor(&MBB, SI); + // Normalize the probabilities if we didn't end up splitting the edge. + if (!IsEdgeSplit) + NewMBB.normalizeSuccProbs(); + + // Now replace all of the moved successors in the original block with the new + // block. This will merge their probabilities. + for (MachineBasicBlock *Succ : NewMBB.successors()) + if (Succ != &UnsplitSucc) + MBB.replaceSuccessor(Succ, &NewMBB); + + // We should always end up replacing at least one successor. + assert(MBB.isSuccessor(&NewMBB) && + "Failed to make the new block a successor!"); + + // Now update all the PHIs. + for (MachineBasicBlock *Succ : NewMBB.successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) { + MachineOperand &OpV = MI.getOperand(OpIdx); + MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); + assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); + if (OpMBB.getMBB() != &MBB) + continue; + + // Replace the operand for unsplit successors + if (!IsEdgeSplit || Succ != &UnsplitSucc) { + OpMBB.setMBB(&NewMBB); + + // We have to continue scanning as there may be multiple entries in + // the PHI. + continue; + } + + // When we have split the edge append a new successor. + MI.addOperand(MF, OpV); + MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); + break; + } + } + } + + return NewMBB; +} + +bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << " **********\n"); + + auto &Subtarget = MF.getSubtarget<X86Subtarget>(); + MRI = &MF.getRegInfo(); + TII = Subtarget.getInstrInfo(); + TRI = Subtarget.getRegisterInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + PromoteRC = &X86::GR8RegClass; + + if (MF.begin() == MF.end()) + // Nothing to do for a degenerate empty function... + return false; + + SmallVector<MachineInstr *, 4> Copies; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == TargetOpcode::COPY && + MI.getOperand(0).getReg() == X86::EFLAGS) + Copies.push_back(&MI); + + for (MachineInstr *CopyI : Copies) { + MachineBasicBlock &MBB = *CopyI->getParent(); + + MachineOperand &VOp = CopyI->getOperand(1); + assert(VOp.isReg() && + "The input to the copy for EFLAGS should always be a register!"); + MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); + if (CopyDefI.getOpcode() != TargetOpcode::COPY) { + // FIXME: The big likely candidate here are PHI nodes. We could in theory + // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard + // enough that it is probably better to change every other part of LLVM + // to avoid creating them. The issue is that once we have PHIs we won't + // know which original EFLAGS value we need to capture with our setCCs + // below. The end result will be computing a complete set of setCCs that + // we *might* want, computing them in every place where we copy *out* of + // EFLAGS and then doing SSA formation on all of them to insert necessary + // PHI nodes and consume those here. Then hoping that somehow we DCE the + // unnecessary ones. This DCE seems very unlikely to be successful and so + // we will almost certainly end up with a glut of dead setCC + // instructions. Until we have a motivating test case and fail to avoid + // it by changing other parts of LLVM's lowering, we refuse to handle + // this complex case here. + DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; + CopyDefI.dump()); + report_fatal_error( + "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); + } + + auto Cleanup = make_scope_exit([&] { + // All uses of the EFLAGS copy are now rewritten, kill the copy into + // eflags and if dead the copy from. + CopyI->eraseFromParent(); + if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) + CopyDefI.eraseFromParent(); + ++NumCopiesEliminated; + }); + + MachineOperand &DOp = CopyI->getOperand(0); + assert(DOp.isDef() && "Expected register def!"); + assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); + if (DOp.isDead()) + continue; + + MachineBasicBlock &TestMBB = *CopyDefI.getParent(); + auto TestPos = CopyDefI.getIterator(); + DebugLoc TestLoc = CopyDefI.getDebugLoc(); + + DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); + + // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer + // jumps because their usage is very constrained. + bool FlagsKilled = false; + SmallVector<MachineInstr *, 4> JmpIs; + + // Gather the condition flags that have already been preserved in + // registers. We do this from scratch each time as we expect there to be + // very few of them and we expect to not revisit the same copy definition + // many times. If either of those change sufficiently we could build a map + // of these up front instead. + CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI); + + // Collect the basic blocks we need to scan. Typically this will just be + // a single basic block but we may have to scan multiple blocks if the + // EFLAGS copy lives into successors. + SmallVector<MachineBasicBlock *, 2> Blocks; + SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks; + Blocks.push_back(&MBB); + VisitedBlocks.insert(&MBB); + + do { + MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); + + // We currently don't do any PHI insertion and so we require that the + // test basic block dominates all of the use basic blocks. + // + // We could in theory do PHI insertion here if it becomes useful by just + // taking undef values in along every edge that we don't trace this + // EFLAGS copy along. This isn't as bad as fully general PHI insertion, + // but still seems like a great deal of complexity. + // + // Because it is theoretically possible that some earlier MI pass or + // other lowering transformation could induce this to happen, we do + // a hard check even in non-debug builds here. + if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) { + DEBUG({ + dbgs() << "ERROR: Encountered use that is not dominated by our test " + "basic block! Rewriting this would require inserting PHI " + "nodes to track the flag state across the CFG.\n\nTest " + "block:\n"; + TestMBB.dump(); + dbgs() << "Use block:\n"; + UseMBB.dump(); + }); + report_fatal_error("Cannot lower EFLAGS copy when original copy def " + "does not dominate all uses."); + } + + for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator()) + : UseMBB.instr_begin(), + MIE = UseMBB.instr_end(); + MII != MIE;) { + MachineInstr &MI = *MII++; + MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); + if (!FlagUse) { + if (MI.findRegisterDefOperand(X86::EFLAGS)) { + // If EFLAGS are defined, it's as-if they were killed. We can stop + // scanning here. + // + // NB!!! Many instructions only modify some flags. LLVM currently + // models this as clobbering all flags, but if that ever changes + // this will need to be carefully updated to handle that more + // complex logic. + FlagsKilled = true; + break; + } + continue; + } + + DEBUG(dbgs() << " Rewriting use: "; MI.dump()); + + // Check the kill flag before we rewrite as that may change it. + if (FlagUse->isKill()) + FlagsKilled = true; + + // Once we encounter a branch, the rest of the instructions must also be + // branches. We can't rewrite in place here, so we handle them below. + // + // Note that we don't have to handle tail calls here, even conditional + // tail calls, as those are not introduced into the X86 MI until post-RA + // branch folding or black placement. As a consequence, we get to deal + // with the simpler formulation of conditional branches followed by tail + // calls. + if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + auto JmpIt = MI.getIterator(); + do { + JmpIs.push_back(&*JmpIt); + ++JmpIt; + } while (JmpIt != UseMBB.instr_end() && + X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::COND_INVALID); + break; + } + + // Otherwise we can just rewrite in-place. + if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (X86::getCondFromSETOpc(MI.getOpcode()) != + X86::COND_INVALID) { + rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (MI.getOpcode() == TargetOpcode::COPY) { + rewriteCopy(MI, *FlagUse, CopyDefI); + } else { + // We assume all other instructions that use flags also def them. + assert(MI.findRegisterDefOperand(X86::EFLAGS) && + "Expected a def of EFLAGS for this instruction!"); + + // NB!!! Several arithmetic instructions only *partially* update + // flags. Theoretically, we could generate MI code sequences that + // would rely on this fact and observe different flags independently. + // But currently LLVM models all of these instructions as clobbering + // all the flags in an undef way. We rely on that to simplify the + // logic. + FlagsKilled = true; + + switch (MI.getOpcode()) { + case X86::SETB_C8r: + case X86::SETB_C16r: + case X86::SETB_C32r: + case X86::SETB_C64r: + // Use custom lowering for arithmetic that is merely extending the + // carry flag. We model this as the SETB_C* pseudo instructions. + rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse, + CondRegs); + break; + + default: + // Generically handle remaining uses as arithmetic instructions. + rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, + CondRegs); + break; + } + break; + } + + // If this was the last use of the flags, we're done. + if (FlagsKilled) + break; + } + + // If the flags were killed, we're done with this block. + if (FlagsKilled) + break; + + // Otherwise we need to scan successors for ones where the flags live-in + // and queue those up for processing. + for (MachineBasicBlock *SuccMBB : UseMBB.successors()) + if (SuccMBB->isLiveIn(X86::EFLAGS) && + VisitedBlocks.insert(SuccMBB).second) + Blocks.push_back(SuccMBB); + } while (!Blocks.empty()); + + // Now rewrite the jumps that use the flags. These we handle specially + // because if there are multiple jumps in a single basic block we'll have + // to do surgery on the CFG. + MachineBasicBlock *LastJmpMBB = nullptr; + for (MachineInstr *JmpI : JmpIs) { + // Past the first jump within a basic block we need to split the blocks + // apart. + if (JmpI->getParent() == LastJmpMBB) + splitBlock(*JmpI->getParent(), *JmpI, *TII); + else + LastJmpMBB = JmpI->getParent(); + + rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs); + } + + // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if + // the copy's def operand is itself a kill. + } + +#ifndef NDEBUG + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == TargetOpcode::COPY && + (MI.getOperand(0).getReg() == X86::EFLAGS || + MI.getOperand(1).getReg() == X86::EFLAGS)) { + DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump()); + llvm_unreachable("Unlowered EFLAGS copy!"); + } +#endif + + return true; +} + +/// Collect any conditions that have already been set in registers so that we +/// can re-use them rather than adding duplicates. +CondRegArray +X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB, + MachineInstr &CopyDefI) { + CondRegArray CondRegs = {}; + + // Scan backwards across the range of instructions with live EFLAGS. + for (MachineInstr &MI : llvm::reverse( + llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) { + X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); + if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() && + TRI->isVirtualRegister(MI.getOperand(0).getReg())) + CondRegs[Cond] = MI.getOperand(0).getReg(); + + // Stop scanning when we see the first definition of the EFLAGS as prior to + // this we would potentially capture the wrong flag state. + if (MI.findRegisterDefOperand(X86::EFLAGS)) + break; + } + return CondRegs; +} + +unsigned X86FlagsCopyLoweringPass::promoteCondToReg( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond) { + unsigned Reg = MRI->createVirtualRegister(PromoteRC); + auto SetI = BuildMI(TestMBB, TestPos, TestLoc, + TII->get(X86::getSETFromCond(Cond)), Reg); + (void)SetI; + DEBUG(dbgs() << " save cond: "; SetI->dump()); + ++NumSetCCsInserted; + return Reg; +} + +std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { + unsigned &CondReg = CondRegs[Cond]; + unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; + if (!CondReg && !InvCondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + if (CondReg) + return {CondReg, false}; + else + return {InvCondReg, true}; +} + +void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + DebugLoc Loc, unsigned Reg) { + // We emit test instructions as register/immediate test against -1. This + // allows register allocation to fold a memory operand if needed (that will + // happen often due to the places this code is emitted). But hopefully will + // also allow us to select a shorter encoding of `testb %reg, %reg` when that + // would be equivalent. + auto TestI = + BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); + (void)TestI; + DEBUG(dbgs() << " test cond: "; TestI->dump()); + ++NumTestsInserted; +} + +void X86FlagsCopyLoweringPass::rewriteArithmetic( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // Arithmetic is either reading CF or OF. Figure out which condition we need + // to preserve in a register. + X86::CondCode Cond; + + // The addend to use to reset CF or OF when added to the flag value. + int Addend; + + switch (getMnemonicFromOpcode(MI.getOpcode())) { + case FlagArithMnemonic::ADC: + case FlagArithMnemonic::ADCX: + case FlagArithMnemonic::RCL: + case FlagArithMnemonic::RCR: + case FlagArithMnemonic::SBB: + Cond = X86::COND_B; // CF == 1 + // Set up an addend that when one is added will need a carry due to not + // having a higher bit available. + Addend = 255; + break; + + case FlagArithMnemonic::ADOX: + Cond = X86::COND_O; // OF == 1 + // Set up an addend that when one is added will turn from positive to + // negative and thus overflow in the signed domain. + Addend = 127; + break; + } + + // Now get a register that contains the value of the flag input to the + // arithmetic. We require exactly this flag to simplify the arithmetic + // required to materialize it back into the flag. + unsigned &CondReg = CondRegs[Cond]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + MachineBasicBlock &MBB = *MI.getParent(); + + // Insert an instruction that will set the flag back to the desired value. + unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); + auto AddI = + BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) + .addDef(TmpReg, RegState::Dead) + .addReg(CondReg) + .addImm(Addend); + (void)AddI; + DEBUG(dbgs() << " add cond: "; AddI->dump()); + ++NumAddsInserted; + FlagUse.setIsKill(true); +} + +void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, + MachineInstr &CMovI, + MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // First get the register containing this specific condition. + X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); + unsigned CondReg; + bool Inverted; + std::tie(CondReg, Inverted) = + getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); + + MachineBasicBlock &MBB = *CMovI.getParent(); + + // Insert a direct test of the saved register. + insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); + + // Rewrite the CMov to use the !ZF flag from the test (but match register + // size and memory operand), and then kill its use of the flags afterward. + auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); + CMovI.setDesc(TII->get(X86::getCMovFromCond( + Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, + !CMovI.memoperands_empty()))); + FlagUse.setIsKill(true); + DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); +} + +void X86FlagsCopyLoweringPass::rewriteCondJmp( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { + // First get the register containing this specific condition. + X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); + unsigned CondReg; + bool Inverted; + std::tie(CondReg, Inverted) = + getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); + + MachineBasicBlock &JmpMBB = *JmpI.getParent(); + + // Insert a direct test of the saved register. + insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); + + // Rewrite the jump to use the !ZF flag from the test, and kill its use of + // flags afterward. + JmpI.setDesc(TII->get( + X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); + const int ImplicitEFLAGSOpIdx = 1; + JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); + DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); +} + +void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, + MachineOperand &FlagUse, + MachineInstr &CopyDefI) { + // Just replace this copy with the the original copy def. + MRI->replaceRegWith(MI.getOperand(0).getReg(), + CopyDefI.getOperand(0).getReg()); + MI.eraseFromParent(); +} + +void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( + MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // This routine is only used to handle pseudos for setting a register to zero + // or all ones based on CF. This is essentially the sign extended from 1-bit + // form of SETB and modeled with the SETB_C* pseudos. They require special + // handling as they aren't normal SETcc instructions and are lowered to an + // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that + // they are only provided in reg-defining forms. A complicating factor is that + // they can define many different register widths. + assert(SetBI.getOperand(0).isReg() && + "Cannot have a non-register defined operand to this variant of SETB!"); + + // Little helper to do the common final step of replacing the register def'ed + // by this SETB instruction with a new register and removing the SETB + // instruction. + auto RewriteToReg = [&](unsigned Reg) { + MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); + SetBI.eraseFromParent(); + }; + + // Grab the register class used for this particular instruction. + auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); + + MachineBasicBlock &MBB = *SetBI.getParent(); + auto SetPos = SetBI.getIterator(); + auto SetLoc = SetBI.getDebugLoc(); + + auto AdjustReg = [&](unsigned Reg) { + auto &OrigRC = *MRI->getRegClass(Reg); + if (&OrigRC == &SetBRC) + return Reg; + + unsigned NewReg; + + int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; + int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; + assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); + assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); + int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, + X86::NoSubRegister, X86::sub_32bit}; + + // If the original size is smaller than the target *and* is smaller than 4 + // bytes, we need to explicitly zero extend it. We always extend to 4-bytes + // to maximize the chance of being able to CSE that operation and to avoid + // partial dependency stalls extending to 2-bytes. + if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { + NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); + BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) + .addReg(Reg); + if (&SetBRC == &X86::GR32RegClass) + return NewReg; + Reg = NewReg; + OrigRegSize = 4; + } + + NewReg = MRI->createVirtualRegister(&SetBRC); + if (OrigRegSize < TargetRegSize) { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), + NewReg) + .addImm(0) + .addReg(Reg) + .addImm(SubRegIdx[OrigRegSize]); + } else if (OrigRegSize > TargetRegSize) { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG), + NewReg) + .addReg(Reg) + .addImm(SubRegIdx[TargetRegSize]); + } else { + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) + .addReg(Reg); + } + return NewReg; + }; + + unsigned &CondReg = CondRegs[X86::COND_B]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); + + // Adjust the condition to have the desired register width by zero-extending + // as needed. + // FIXME: We should use a better API to avoid the local reference and using a + // different variable here. + unsigned ExtCondReg = AdjustReg(CondReg); + + // Now we need to turn this into a bitmask. We do this by subtracting it from + // zero. + unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); + BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); + ZeroReg = AdjustReg(ZeroReg); + + unsigned Sub; + switch (SetBI.getOpcode()) { + case X86::SETB_C8r: + Sub = X86::SUB8rr; + break; + + case X86::SETB_C16r: + Sub = X86::SUB16rr; + break; + + case X86::SETB_C32r: + Sub = X86::SUB32rr; + break; + + case X86::SETB_C64r: + Sub = X86::SUB64rr; + break; + + default: + llvm_unreachable("Invalid SETB_C* opcode!"); + } + unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); + BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) + .addReg(ZeroReg) + .addReg(ExtCondReg); + return RewriteToReg(ResultReg); +} + +void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, + MachineInstr &SetCCI, + MachineOperand &FlagUse, + CondRegArray &CondRegs) { + X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); + // Note that we can't usefully rewrite this to the inverse without complex + // analysis of the users of the setCC. Largely we rely on duplicates which + // could have been avoided already being avoided here. + unsigned &CondReg = CondRegs[Cond]; + if (!CondReg) + CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); + + // Rewriting a register def is trivial: we just replace the register and + // remove the setcc. + if (!SetCCI.mayStore()) { + assert(SetCCI.getOperand(0).isReg() && + "Cannot have a non-register defined operand to SETcc!"); + MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg); + SetCCI.eraseFromParent(); + return; + } + + // Otherwise, we need to emit a store. + auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), + SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); + // Copy the address operands. + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.add(SetCCI.getOperand(i)); + + MIB.addReg(CondReg); + + MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end()); + + SetCCI.eraseFromParent(); + return; +} diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 10e19f92b4a63..c1ddb771e2faa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -27781,11 +27781,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); - // Permit reads of the FLAGS register without it being defined. + // Permit reads of the EFLAGS and DF registers without them being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. + assert(Push->getOperand(2).getReg() == X86::EFLAGS && + "Unexpected register in operand!"); Push->getOperand(2).setIsUndef(); + assert(Push->getOperand(3).getReg() == X86::DF && + "Unexpected register in operand!"); + Push->getOperand(3).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. @@ -37829,25 +37834,6 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { } } -/// This function checks if any of the users of EFLAGS copies the EFLAGS. We -/// know that the code that lowers COPY of EFLAGS has to use the stack, and if -/// we don't adjust the stack we clobber the first frame index. -/// See X86InstrInfo::copyPhysReg. -static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - return any_of(MRI.reg_instructions(X86::EFLAGS), - [](const MachineInstr &RI) { return RI.isCopy(); }); -} - -void X86TargetLowering::finalizeLowering(MachineFunction &MF) const { - if (hasCopyImplyingStackAdjustment(MF)) { - MachineFrameInfo &MFI = MF.getFrameInfo(); - MFI.setHasCopyImplyingStackAdjustment(true); - } - - TargetLoweringBase::finalizeLowering(MF); -} - /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired promotion /// type by reference. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 3aa9d01bff20c..7820c3e032e55 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1099,9 +1099,6 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - - void finalizeLowering(MachineFunction &MF) const override; - protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index d09deb5b75848..98cc8fb7439e8 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1334,7 +1334,7 @@ let Predicates = [HasBMI2] in { } //===----------------------------------------------------------------------===// -// ADCX Instruction +// ADCX and ADOX Instructions // let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], Constraints = "$src0 = $dst", AddedComplexity = 10 in { @@ -1349,6 +1349,15 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], IIC_BIN_CARRY_NONMEM>, T8PD; + + // We don't have patterns for ADOX yet. + let hasSideEffects = 0 in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src0, GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src0, GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + } // hasSideEffects = 0 } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { @@ -1363,27 +1372,14 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], IIC_BIN_CARRY_MEM>, T8PD; - } -} -//===----------------------------------------------------------------------===// -// ADOX Instruction -// -let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], - Uses = [EFLAGS] in { - let SchedRW = [WriteALU] in { - def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; - - def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; - } // SchedRW - - let mayLoad = 1, SchedRW = [WriteALULd] in { - def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + // We don't have patterns for ADOX yet. + let hasSideEffects = 0 in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src0, i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; - def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src0, i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; } + } // hasSideEffects = 0 } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index d66d9258e96f9..b3371c96cc29d 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -473,7 +473,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], usesCustomInserter = 1, Uses = [ESP, SSP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", @@ -493,7 +493,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], usesCustomInserter = 1, Uses = [RSP, SSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", @@ -509,7 +509,7 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), // For i386, the address of the thunk is passed on the stack, on return the // address of the variable is in %eax. %ecx is trashed during the function // call. All other registers are preserved. -let Defs = [EAX, ECX, EFLAGS], +let Defs = [EAX, ECX, EFLAGS, DF], Uses = [ESP, SSP], usesCustomInserter = 1 in def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), @@ -522,7 +522,7 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // %rdi. The lowering will do the right thing with RDI. // On return the address of the variable is in %rax. All other // registers are preserved. -let Defs = [RAX, EFLAGS], +let Defs = [RAX, EFLAGS, DF], Uses = [RSP, SSP], usesCustomInserter = 1 in def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7ca1c58184f63..11ada51a87046 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -5782,7 +5782,7 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, return false; } -static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { +X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; case X86::JE_1: return X86::COND_E; @@ -5805,7 +5805,7 @@ static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { } /// Return condition code of a SET opcode. -static X86::CondCode getCondFromSETOpc(unsigned Opc) { +X86::CondCode X86::getCondFromSETOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; case X86::SETAr: case X86::SETAm: return X86::COND_A; @@ -6130,7 +6130,7 @@ void X86InstrInfo::replaceBranchWithTailCall( if (!I->isBranch()) assert(0 && "Can't find the branch to replace!"); - X86::CondCode CC = getCondFromBranchOpc(I->getOpcode()); + X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode()); assert(BranchCond.size() == 1); if (CC != BranchCond[0].getImm()) continue; @@ -6237,7 +6237,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // Handle conditional branches. - X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode()); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. @@ -6433,7 +6433,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, if (I->isDebugValue()) continue; if (I->getOpcode() != X86::JMP_1 && - getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -6710,102 +6710,12 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - bool FromEFLAGS = SrcReg == X86::EFLAGS; - bool ToEFLAGS = DestReg == X86::EFLAGS; - int Reg = FromEFLAGS ? DestReg : SrcReg; - bool is32 = X86::GR32RegClass.contains(Reg); - bool is64 = X86::GR64RegClass.contains(Reg); - - if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; - int Pop = is64 ? X86::POP64r : X86::POP32r; - int PopF = is64 ? X86::POPF64 : X86::POPF32; - int AX = is64 ? X86::RAX : X86::EAX; - - if (!Subtarget.hasLAHFSAHF()) { - assert(Subtarget.is64Bit() && - "Not having LAHF/SAHF only happens on 64-bit."); - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - usesTheStack. - if (FromEFLAGS) { - BuildMI(MBB, MI, DL, get(PushF)); - BuildMI(MBB, MI, DL, get(Pop), DestReg); - } - if (ToEFLAGS) { - BuildMI(MBB, MI, DL, get(Push)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(PopF)); - } - return; - } - - // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is - // inefficient. Instead: - // - Save the overflow flag OF into AL using SETO, and restore it using a - // signed 8-bit addition of AL and INT8_MAX. - // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH - // using LAHF/SAHF. - // - When RAX/EAX is live and isn't the destination register, make sure it - // isn't clobbered by PUSH/POP'ing it before and after saving/restoring - // the flags. - // This approach is ~2.25x faster than using PUSHF/POPF. - // - // This is still somewhat inefficient because we don't know which flags are - // actually live inside EFLAGS. Were we able to do a single SETcc instead of - // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. - // - // PUSHF/POPF is also potentially incorrect because it affects other flags - // such as TF/IF/DF, which LLVM doesn't model. - // - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. - // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. - - const TargetRegisterInfo &TRI = getRegisterInfo(); - MachineBasicBlock::LivenessQueryResult LQR = - MBB.computeRegisterLiveness(&TRI, AX, MI); - // We do not want to save and restore AX if we do not have to. - // Moreover, if we do so whereas AX is dead, we would need to set - // an undef flag on the use of AX, otherwise the verifier will - // complain that we read an undef value. - // We do not want to change the behavior of the machine verifier - // as this is usually wrong to read an undef value. - if (MachineBasicBlock::LQR_Unknown == LQR) { - LivePhysRegs LPR(TRI); - LPR.addLiveOuts(MBB); - MachineBasicBlock::iterator I = MBB.end(); - while (I != MI) { - --I; - LPR.stepBackward(*I); - } - // AX contains the top most register in the aliasing hierarchy. - // It may not be live, but one of its aliases may be. - for (MCRegAliasIterator AI(AX, &TRI, true); - AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) - LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live - : MachineBasicBlock::LQR_Dead; - } - bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR); - if (!AXDead) - BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); - if (FromEFLAGS) { - BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); - BuildMI(MBB, MI, DL, get(X86::LAHF)); - BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); - } - if (ToEFLAGS) { - BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) - .addReg(X86::AL) - .addImm(INT8_MAX); - BuildMI(MBB, MI, DL, get(X86::SAHF)); - } - if (!AXDead) - BuildMI(MBB, MI, DL, get(Pop), AX); - return; + if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) { + // FIXME: We use a fatal error here because historically LLVM has tried + // lower some of these physreg copies and we want to ensure we get + // reasonable bug reports if someone encounters a case no other testing + // found. This path should be removed after the LLVM 7 release. + report_fatal_error("Unable to copy EFLAGS physical register!"); } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -7465,9 +7375,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) - OldCC = getCondFromBranchOpc(Instr.getOpcode()); + OldCC = X86::getCondFromBranchOpc(Instr.getOpcode()); else { - OldCC = getCondFromSETOpc(Instr.getOpcode()); + OldCC = X86::getCondFromSETOpc(Instr.getOpcode()); if (OldCC != X86::COND_INVALID) OpcIsSET = true; else @@ -9413,8 +9323,9 @@ bool X86InstrInfo:: isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { // FIXME: Return false for x87 stack register classes for now. We can't // allow any loads of these registers before FpGet_ST0_80. - return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || - RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); + return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass || + RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass || + RC == &X86::RFP80RegClass); } /// Return a virtual register initialized with the diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 02a09c340cef2..2b5ad934f9b18 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -77,6 +77,12 @@ unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand = false); +// Turn jCC opcode into condition code. +CondCode getCondFromBranchOpc(unsigned Opc); + +// Turn setCC opcode into condition code. +CondCode getCondFromSETOpc(unsigned Opc); + // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index a657b19c08c97..68f40c28d5279 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1235,18 +1235,18 @@ let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, SchedRW = [WriteRMW] in { - let Defs = [ESP, EFLAGS], Uses = [ESP] in + let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), [(int_x86_flags_write_u32 GR32:$src)]>, Requires<[Not64BitMode]>; - let Defs = [RSP, EFLAGS], Uses = [RSP] in + let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), [(int_x86_flags_write_u64 GR64:$src)]>, Requires<[In64BitMode]>; } -let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, +let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0, SchedRW = [WriteLoad] in { def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize16; @@ -1254,7 +1254,7 @@ def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, OpSize32, Requires<[Not64BitMode]>; } -let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, +let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0, SchedRW = [WriteStore] in { def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, OpSize16; @@ -1294,10 +1294,10 @@ def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), Requires<[In64BitMode]>; } -let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; -let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in +let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; @@ -1382,8 +1382,7 @@ def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { +let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in { def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), @@ -1394,36 +1393,33 @@ def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; } -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in +let Defs = [EDI], Uses = [AL,EDI,DF] in def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst), "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>; -let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in +let Defs = [EDI], Uses = [AX,EDI,DF] in def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst), "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16; -let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in +let Defs = [EDI], Uses = [EAX,EDI,DF] in def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,DF] in def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>; -let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16; -let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32; -let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>; -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in { def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), @@ -2070,8 +2066,7 @@ def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>, } // SchedRW // Repeat string operation instruction prefixes -// These use the DF flag in the EFLAGS register to inc or dec ECX -let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in { +let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in { // Repeat (used with INS, OUTS, MOVS, LODS and STOS) def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; // Repeat while not equal (used with CMPS and SCAS) @@ -2080,24 +2075,22 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; // String manipulation instructions let SchedRW = [WriteMicrocoded] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in +let Defs = [AL,ESI], Uses = [ESI,DF] in def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>; -let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [AX,ESI], Uses = [ESI,DF] in def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16; -let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [EAX,ESI], Uses = [ESI,DF] in def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32; -let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in +let Defs = [RAX,ESI], Uses = [ESI,DF] in def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>; } let SchedRW = [WriteSystem] in { -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in { +let Defs = [ESI], Uses = [DX,ESI,DF] in { def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>; def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), @@ -2106,8 +2099,7 @@ def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32; } -// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI -let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in { +let Defs = [EDI], Uses = [DX,EDI,DF] in { def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst), "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>; def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst), @@ -2117,19 +2109,22 @@ def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst), } } -// Flag instructions -let SchedRW = [WriteALU] in { -def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; -def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; -def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; -def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +// EFLAGS management instructions. +let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC_CMC_STC>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_CLC_CMC_STC>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CLC_CMC_STC>; +} + +// DF management instructions. +// FIXME: These are a bit more expensive than CLC and STC. We should consider +// adjusting their schedule bucket. +let SchedRW = [WriteALU], Defs = [DF] in { def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; -def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; - -def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } + // Table lookup instructions let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 40d2dca4f9ec9..576f87b13ab46 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -693,6 +693,19 @@ let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { } // SchedRW //===----------------------------------------------------------------------===// +// TS flag control instruction. +let SchedRW = [WriteSystem] in { +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} + +//===----------------------------------------------------------------------===// +// IF (inside EFLAGS) management instructions. +let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in { +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +} + +//===----------------------------------------------------------------------===// // RDPID Instruction let SchedRW = [WriteSystem] in { def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins), diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 2341e1fb0facf..1a776dcd04ebb 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -251,9 +251,19 @@ def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; // Floating-point status word def FPSW : X86Reg<"fpsw", 0>; -// Status flags register +// Status flags register. +// +// Note that some flags that are commonly thought of as part of the status +// flags register are modeled separately. Typically this is due to instructions +// reading and updating those flags independently of all the others. We don't +// want to create false dependencies between these instructions and so we use +// a separate register to model them. def EFLAGS : X86Reg<"flags", 0>; +// The direction flag. +def DF : X86Reg<"DF", 0>; + + // Segment registers def CS : X86Reg<"cs", 1>; def DS : X86Reg<"ds", 3>; @@ -497,6 +507,10 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } +def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} // AVX-512 vector/mask registers. def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 2e21a97541b2c..078d459634cea 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -608,12 +608,10 @@ def IIC_CMPXCHG_8B : InstrItinClass; def IIC_CMPXCHG_16B : InstrItinClass; def IIC_LODS : InstrItinClass; def IIC_OUTS : InstrItinClass; -def IIC_CLC : InstrItinClass; +def IIC_CLC_CMC_STC : InstrItinClass; def IIC_CLD : InstrItinClass; def IIC_CLI : InstrItinClass; -def IIC_CMC : InstrItinClass; def IIC_CLTS : InstrItinClass; -def IIC_STC : InstrItinClass; def IIC_STI : InstrItinClass; def IIC_STD : InstrItinClass; def IIC_XLAT : InstrItinClass; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index e052ad98104cf..460b9823a7e7b 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -514,12 +514,10 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, - InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLC_CMC_STC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, - InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, - InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index ac242e1c00e04..e41e16d82d83b 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -62,6 +62,7 @@ void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); +void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); } // end namespace llvm @@ -80,6 +81,7 @@ extern "C" void LLVMInitializeX86Target() { initializeX86CmovConverterPassPass(PR); initializeX86ExecutionDepsFixPass(PR); initializeX86DomainReassignmentPass(PR); + initializeX86FlagsCopyLoweringPassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -415,6 +417,7 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86CallFrameOptimization()); } + addPass(createX86FlagsCopyLoweringPass()); addPass(createX86WinAllocaExpander()); } void X86PassConfig::addMachineSSAOptimization() { |