author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-16 19:46:52 +0000
commit | 6b3f41ed88e8e440e11a4fbf20b6600529f80049 (patch)
tree | 928b056f24a634d628c80238dbbf10d41b1a71d5 /lib/Target/X86
parent | c46e6a5940c50058e00c0c5f9123fd82e338d29a (diff)
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- | lib/Target/X86/X86.td | 3
-rw-r--r-- | lib/Target/X86/X86FastISel.cpp | 48
-rw-r--r-- | lib/Target/X86/X86FixupLEAs.cpp | 269
-rw-r--r-- | lib/Target/X86/X86ISelDAGToDAG.cpp | 41
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 214
-rw-r--r-- | lib/Target/X86/X86InstrCompiler.td | 14
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 52
-rw-r--r-- | lib/Target/X86/X86InstrInfo.h | 11
-rw-r--r-- | lib/Target/X86/X86InstrInfo.td | 35
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 18
-rw-r--r-- | lib/Target/X86/X86InstructionSelector.cpp | 214
-rw-r--r-- | lib/Target/X86/X86IntrinsicsInfo.h | 2
-rw-r--r-- | lib/Target/X86/X86LegalizerInfo.cpp | 16
-rw-r--r-- | lib/Target/X86/X86RegisterInfo.cpp | 14
-rw-r--r-- | lib/Target/X86/X86Subtarget.h | 6
-rw-r--r-- | lib/Target/X86/X86TargetMachine.cpp | 4
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 194
-rw-r--r-- | lib/Target/X86/X86WinEHState.cpp | 2
18 files changed, 730 insertions, 427 deletions
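A large part of the diff below adds a `Slow3OpsLEA` subtarget feature and teaches `X86FixupLEAs.cpp` to split up the LEA forms that the Intel Optimization Reference Manual flags as 3-cycle on Sandy Bridge and later: LEAs with base, index and displacement, and LEAs whose base is EBP/RBP/R13 while an index register is present. As a rough, standalone illustration of the condition the new helpers (`isThreeOperandsLEA`, `hasInefficientLEABaseReg`) check — using a made-up `ToyLEA` struct rather than the in-tree `MachineOperand`-based code:

```cpp
// Sketch only: mirrors the classification added in X86FixupLEAs.cpp, but with
// hypothetical, simplified types instead of LLVM's MachineInstr/MachineOperand.
#include <cstdio>
#include <string>

struct ToyLEA {        // simplified operand view of an LEA address
  std::string Base;    // "" means no base register
  std::string Index;   // "" means no index register
  int Scale;
  long long Disp;
};

static bool isInefficientBaseReg(const std::string &Reg) {
  // Registers that make "base + index" LEAs dispatch on port 1 with 3-cycle
  // latency, per the comment quoted in the patch.
  return Reg == "EBP" || Reg == "RBP" || Reg == "R13";
}

static bool isSlow3OpsLEA(const ToyLEA &L) {
  bool HasBase = !L.Base.empty();
  bool HasIndex = !L.Index.empty();
  bool HasDisp = L.Disp != 0;
  // Case 1: all three source operands (base, index, displacement) are present.
  if (HasBase && HasIndex && HasDisp)
    return true;
  // Case 2: base is EBP/RBP/R13 and an index register is also used.
  return HasBase && HasIndex && isInefficientBaseReg(L.Base);
}

int main() {
  ToyLEA A{"RAX", "RCX", 1, 16}; // lea 16(%rax,%rcx,1), %rdx -> split into LEA + ADD
  ToyLEA B{"RBP", "RCX", 1, 0};  // lea (%rbp,%rcx,1), %rdx   -> inefficient base reg
  ToyLEA C{"RAX", "", 1, 8};     // lea 8(%rax), %rdx         -> left alone
  std::printf("%d %d %d\n", isSlow3OpsLEA(A), isSlow3OpsLEA(B), isSlow3OpsLEA(C));
  return 0;
}
```

When such an LEA is found, the pass rewrites it into at most two cheaper instructions (for example `lea (%base,%index,1), %base` becomes `add %index,%base`, and a 3-operand LEA becomes a 2-operand LEA plus an `add` of the displacement), as documented in the comments of `processInstrForSlow3OpLEA` in the diff.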
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 784c3a6557ff..3a421fe77392 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat @@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, + FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ebd179e786da..fc3b4836c178 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -180,44 +180,6 @@ private: } // end anonymous namespace. -static std::pair<X86::CondCode, bool> -getX86ConditionCode(CmpInst::Predicate Predicate) { - X86::CondCode CC = X86::COND_INVALID; - bool NeedSwap = false; - switch (Predicate) { - default: break; - // Floating-point Predicates - case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; - case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_OGT: CC = X86::COND_A; break; - case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; - case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_ULT: CC = X86::COND_B; break; - case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; - case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; - case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; - case CmpInst::FCMP_UNO: CC = X86::COND_P; break; - case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; - case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; - case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; - - // Integer Predicates - case CmpInst::ICMP_EQ: CC = X86::COND_E; break; - case CmpInst::ICMP_NE: CC = X86::COND_NE; break; - case CmpInst::ICMP_UGT: CC = X86::COND_A; break; - case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; - case CmpInst::ICMP_ULT: CC = X86::COND_B; break; - case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; - case CmpInst::ICMP_SGT: CC = X86::COND_G; break; - case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; - case CmpInst::ICMP_SLT: CC = X86::COND_L; break; - case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; - } - - return std::make_pair(CC, NeedSwap); -} - static std::pair<unsigned, bool> getX86SSEConditionCode(CmpInst::Predicate Predicate) { unsigned CC; @@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { X86::CondCode CC; bool SwapArgs; - std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); unsigned Opc = X86::getSETFromCond(CC); @@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool SwapArgs; unsigned BranchOpc; - std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); BranchOpc = X86::GetCondBranchFromCond(CC); @@ -2070,7 +2032,7 @@ bool 
X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { } bool NeedSwap; - std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); const Value *CmpLHS = CI->getOperand(0); @@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const auto *CI = dyn_cast<CmpInst>(Cond); if (CI && (CI->getParent() == I->getParent())) { bool NeedSwap; - std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate()); if (CC > X86::LAST_VALID_COND) return false; @@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(0).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 2cd4c1a3e7b3..9f649dad8bc0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -27,20 +27,26 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -#define DEBUG_TYPE "x86-fixup-LEAs" +namespace llvm { +void initializeFixupLEAPassPass(PassRegistry &); +} + +#define FIXUPLEA_DESC "X86 LEA Fixup" +#define FIXUPLEA_NAME "x86-fixup-LEAs" + +#define DEBUG_TYPE FIXUPLEA_NAME STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; + /// \brief Loop over all of the instructions in the basic block /// replacing applicable instructions with LEA instructions, /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on SNB+ try to replace it with other instructions. + /// According to Intel's Optimization Reference Manual: + /// " For LEA instructions with three source operands and some specific + /// situations, instruction latency has increased to 3 cycles, and must + /// dispatch via port 1: + /// - LEA that has all three source operands: base, index, and offset + /// - LEA that uses base and index registers where the base is EBP, RBP, + /// or R13 + /// - LEA that uses RIP relative addressing mode + /// - LEA that uses 16-bit addressing mode " + /// This function currently handles the first 2 cases only. + MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg /// and convert them to INC or DEC respectively. 
bool fixupIncDec(MachineBasicBlock::iterator &I, @@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass { MachineBasicBlock::iterator &MBBI) const; public: - FixupLEAPass() : MachineFunctionPass(ID) {} + static char ID; + + StringRef getPassName() const override { return FIXUPLEA_DESC; } + + FixupLEAPass() : MachineFunctionPass(ID) { + initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); + } /// \brief Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -104,9 +132,12 @@ private: bool OptIncDec; bool OptLEA; }; -char FixupLEAPass::ID = 0; } +char FixupLEAPass::ID = 0; + +INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) + MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { @@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); - OptLEA = ST.LEAusesAG() || ST.slowLEA(); + OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); if (!OptLEA && !OptIncDec) return false; @@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int opcode) { - return opcode == X86::LEA16r || opcode == X86::LEA32r || - opcode == X86::LEA64r || opcode == X86::LEA64_32r; +static inline bool isLEA(const int Opcode) { + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +static inline bool isInefficientLEAReg(unsigned int Reg) { + return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13; +} + +static inline bool isRegOperand(const MachineOperand &Op) { + return Op.isReg() && Op.getReg() != X86::NoRegister; +} +/// hasIneffecientLEARegs - LEA that uses base and index registers +/// where the base is EBP, RBP, or R13 +static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, + const MachineOperand &Index) { + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && + isRegOperand(Index); +} + +static inline bool hasLEAOffset(const MachineOperand &Offset) { + return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); +} + +// LEA instruction that has all three operands: offset, base and index +static inline bool isThreeOperandsLEA(const MachineOperand &Base, + const MachineOperand &Index, + const MachineOperand &Offset) { + return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset); +} + +static inline int getADDrrFromLEA(int LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return X86::ADD16rr; + case X86::LEA32r: + return X86::ADD32rr; + case X86::LEA64_32r: + case X86::LEA64r: + return X86::ADD64rr; + } +} + +static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { + bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; + case X86::LEA32r: + case X86::LEA64_32r: + return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri; + case X86::LEA64r: + return IsInt8 ? 
X86::ADD64ri8 : X86::ADD64ri32; + } } /// isLEASimpleIncOrDec - Does this LEA have one these forms: @@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr &MI = *I; - const int opcode = MI.getOpcode(); - if (!isLEA(opcode)) + const int Opcode = MI.getOpcode(); + if (!isLEA(Opcode)) return; if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; if (MI.getOperand(2).getImm() > 1) return; - int addrr_opcode, addri_opcode; - switch (opcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - addrr_opcode = X86::ADD16rr; - addri_opcode = X86::ADD16ri; - break; - case X86::LEA32r: - addrr_opcode = X86::ADD32rr; - addri_opcode = X86::ADD32ri; - break; - case X86::LEA64_32r: - case X86::LEA64r: - addrr_opcode = X86::ADD64rr; - addri_opcode = X86::ADD64ri32; - break; - } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); MachineInstr *NewMI = nullptr; - const MachineOperand &Dst = MI.getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { - const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); - const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) - .add(Dst) - .add(Src1) - .add(Src2); - MFI->insert(I, NewMI); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); + const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = + BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); DEBUG(NewMI->dump();); } // Make ADD instruction for immediate if (MI.getOperand(4).getImm() != 0) { + const MCInstrDesc &ADDri = + TII->get(getADDriFromLEA(Opcode, MI.getOperand(4))); const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) - .add(Dst) + NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(MI.getOperand(4).getImm()); - MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } if (NewMI) { MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); + I = NewMI; + } +} + +MachineInstr * +FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI) { + + const int LEAOpcode = MI.getOpcode(); + if (!isLEA(LEAOpcode)) + return nullptr; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Base = MI.getOperand(1); + const MachineOperand &Scale = MI.getOperand(2); + const MachineOperand &Index = MI.getOperand(3); + const MachineOperand &Offset = MI.getOperand(4); + const MachineOperand &Segment = MI.getOperand(5); + + if (!(isThreeOperandsLEA(Base, Index, Offset) || + hasInefficientLEABaseReg(Base, Index)) || + !TII->isSafeToClobberEFLAGS(*MFI, MI) || + Segment.getReg() != X86::NoRegister) + return nullptr; + + unsigned int DstR = Dst.getReg(); + unsigned int BaseR = Base.getReg(); + unsigned int IndexR = Index.getReg(); + unsigned SSDstR = + (LEAOpcode == X86::LEA64_32r) ? 
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+ if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // Handle the rest of the cases with inefficient base register: + assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(IsInefficientBase && "efficient base should be handled already!"); + + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst + if (IsScale1 && !hasLEAOffset(Offset)) { + TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill()); + DEBUG(MI.getPrevNode()->dump();); + + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + DEBUG(NewMI->dump();); + return NewMI; } + // lea offset(%base,%index,scale), %dst => + // lea offset( ,%index,scale), %dst; add %base,%dst + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); + DEBUG(NewMI->dump();); + + NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + DEBUG(NewMI->dump();); + return NewMI; } bool FixupLEAPass::processBasicBlock(MachineFunction &MF, @@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, if (OptLEA) { if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + + else { + if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { + MFI->erase(I); + I = NewMI; + } + } else + processInstruction(I, MFI); + } } } return false; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 12a10bf3072f..c899f0fd5100 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; - if (ConstantSDNode - *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { unsigned Val = CN->getZExtValue(); // Note that we handle x<<1 as (,x,2) rather than (x,x) here so // that the base operand remains free for further matching. If @@ -1187,15 +1186,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // in MatchAddress turns (,x,2) into (x,x), which is cheaper. if (Val == 1 || Val == 2 || Val == 3) { AM.Scale = 1 << Val; - SDValue ShVal = N.getNode()->getOperand(0); + SDValue ShVal = N.getOperand(0); // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. 
if (CurDAG->isBaseWithConstantOffset(ShVal)) { - AM.IndexReg = ShVal.getNode()->getOperand(0); - ConstantSDNode *AddVal = - cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + AM.IndexReg = ShVal.getOperand(0); + ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!foldOffsetIntoAddress(Disp, AM)) return false; @@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr) { - if (ConstantSDNode - *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || CN->getZExtValue() == 9) { AM.Scale = unsigned(CN->getZExtValue())-1; - SDValue MulVal = N.getNode()->getOperand(0); + SDValue MulVal = N.getOperand(0); SDValue Reg; // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && - isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { - Reg = MulVal.getNode()->getOperand(0); + isa<ConstantSDNode>(MulVal.getOperand(1))) { + Reg = MulVal.getOperand(0); ConstantSDNode *AddVal = - cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + cast<ConstantSDNode>(MulVal.getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); if (foldOffsetIntoAddress(Disp, AM)) - Reg = N.getNode()->getOperand(0); + Reg = N.getOperand(0); } else { - Reg = N.getNode()->getOperand(0); + Reg = N.getOperand(0); } AM.IndexReg = AM.Base_Reg = Reg; @@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } int Cost = 0; - SDValue RHS = Handle.getValue().getNode()->getOperand(1); + SDValue RHS = Handle.getValue().getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. @@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, RHS.getNode()->getOpcode() == ISD::TRUNCATE || RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && - RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + RHS.getOperand(0).getValueType() == MVT::i32)) ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. 
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C) break; // For example, convert "testl %eax, $8" to "testb %al, $8" @@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { (!(C->getZExtValue() & 0x80) || hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // On x86-32, only the ABCD registers have 8-bit subregisters. if (!Subtarget->is64Bit()) { @@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Put the value in an ABCD register. const TargetRegisterClass *TRC; @@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Extract the 16-bit subregister. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, @@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); - SDValue Reg = N0.getNode()->getOperand(0); + SDValue Reg = N0.getOperand(0); // Extract the 32-bit subregister. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9ee2234595f9..11c08292518a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" @@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); +/// Call this when the user attempts to do something unsupported, like +/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike +/// report_fatal_error, so calling code should attempt to recover without +/// crashing. 
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, + const char *Msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); +} + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); @@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (ValVT == MVT::f64 && + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. - if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) - report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. 
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult( // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, + NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) - return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; @@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getNode(ISD::VSELECT, DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getSelect(DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); + VT, + DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), + V1, V2)); } case MVT::v16f32: case MVT::v8f64: @@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. 
+ assert(Cond.getValueType().getScalarSizeInBits() == + VT.getScalarSizeInBits() && + "Should have a size-matched integer condition!"); + // Build a mask by testing the condition against itself (tests for zero). + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); + // Now return a new VSELECT using the mask. + return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); + } + // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. - switch (Op.getSimpleValueType().SimpleTy) { + switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; @@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), @@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); - SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, - Zero, Four); + SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. @@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); - SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero); if (VT == ExtVT) return SelectedVal; return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); @@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (BitWidth > AndBitWidth) { KnownBits Known; DAG.computeKnownBits(Op0, Known); - if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth) + if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; @@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); - SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); @@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); + SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, + Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); @@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, - Cond, Op1, Op2); + SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } @@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, } else { SDValue NegOne = getOnesVector(ExtVT, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); - V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero); if (ExtVT == VT) return V; } @@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); @@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (isAllOnesConstant(Mask)) - return Op; + + if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) + if (MaskConst->getZExtValue() & 0x1) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, // registration, or the .set_setframe offset. 
MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); @@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } - case CONVERT_MASK_TO_VEC: { - SDValue Mask = Op.getOperand(1); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask); - } case BRCST_SUBVEC_TO_VEC: { SDValue Src = Op.getOperand(1); SDValue Passthru = Op.getOperand(2); @@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Op1 = Op.getOperand(1); auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. @@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); + // ashr(R, 63) === cmp_slt(R, 0) + if (ShiftAmt == 63 && Subtarget.hasSSE42()) { + assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && + "Unsupported PCMPGT op"); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, + getZeroVector(VT, Subtarget, DAG, dl), R); + } + if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = @@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. + // TODO: Replace constant extraction with getTargetConstantBitsFromNode. if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { + // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant. + unsigned SubVectorScale = 1; + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + SubVectorScale = + Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); + Amt = Amt.getOperand(0); + } + // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) @@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / - VT.getVectorNumElements(); + (SubVectorScale * VT.getVectorNumElements()); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 
0 : SplatIndex * Ratio); @@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); - return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; @@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); - return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; @@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) - : (Type*)VectorType::get(ArgTy, 4); + Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) + : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); - + case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. 
return BB; @@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that @@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } @@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - assert(N->getOpcode() == ISD::OR); + assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256()))) + if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); - assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!"); - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); + // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for + // ANDNP combine allows other combines to happen that prevent matching. if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return SDValue(); @@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b unsigned EltBits = MaskVT.getScalarSizeInBits(); - unsigned SraAmt = ~0; - if (Mask.getOpcode() == ISD::SRA) { - if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) - if (auto *AmtConst = AmtBV->getConstantSplatNode()) - SraAmt = AmtConst->getZExtValue(); - } else if (Mask.getOpcode() == X86ISD::VSRAI) - SraAmt = Mask.getConstantOperandVal(1); - if ((SraAmt + 1) != EltBits) + // TODO: Attempt to handle floating point cases as well? 
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); @@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && + DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); @@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, V = Y; if (V) { - if (EltBits != 8 && EltBits != 16 && EltBits != 32) - return SDValue(); - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; @@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (V == Y) std::swap(SubOp1, SubOp2); - return DAG.getBitcast(VT, - DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } } @@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } @@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. - auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); + return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. @@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + return DAG.getSelect(DL, VT, N0, AllOnes, Zero); } return SDValue(); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 3dc673e3c35a..d003d027ddb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. 
let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>; @@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[NotLP64]>; } -def : Pat<(X86callseq_start timm:$amt1), - (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>; // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into @@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1), // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>; @@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[IsLP64]>; } -def : Pat<(X86callseq_start timm:$amt1), - (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>; // x86-64 va_start lowering magic. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 888daa275265..092ceb207ada 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { } } +std::pair<X86::CondCode, bool> +X86::getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + /// Return a set opcode for the given condition and /// whether it has memory 
operand. unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { @@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::AVX1_SETALLONES: { + unsigned Reg = MIB->getOperand(0).getReg(); + // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. + MIB->setDesc(get(X86::VCMPPSYrri)); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); + return true; + } case X86::AVX512_512_SETALLONES: { unsigned Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); @@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Alignment = 64; break; case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: Alignment = 32; @@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: @@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || - Opc == X86::AVX512_256_SET0) + Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || - Opc == X86::AVX512_512_SETALLONES); + Opc == X86::AVX512_512_SETALLONES || + Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 38567831b3a4..e64876073ccf 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -64,6 +64,10 @@ enum CondCode { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); +/// \brief Return a pair of condition code for the given predicate and whether +/// the instruction operands should be swaped to match the condition code. +std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate); + /// \brief Return a set opcode for the given condition and whether it has /// a memory operand. unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); @@ -186,6 +190,8 @@ public: /// setup..destroy sequence (e.g. by pushes, or inside the callee). int64_t getFrameAdjustment(const MachineInstr &I) const { assert(isFrameInstr(I)); + if (isFrameSetup(I)) + return I.getOperand(2).getImm(); return I.getOperand(1).getImm(); } @@ -193,7 +199,10 @@ public: /// instruction. 
void setFrameAdjustment(MachineInstr &I, int64_t V) const { assert(isFrameInstr(I)); - I.getOperand(1).setImm(V); + if (isFrameSetup(I)) + I.getOperand(2).setImm(V); + else + I.getOperand(1).setImm(V); } /// getSPAdjust - This returns the stack pointer adjustment made by diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 902b0c2c04e3..4d7d8ece92d9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; -def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; @@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in { def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 let Predicates = [HasBMI] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 48da2fa607af..f73d85e7e01b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } let Predicates = [HasAVX2] in def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllOnesV))]>; @@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } - -// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit -// all ones value. 
-let Predicates = [HasAVX1Only] in -def : Pat<(v8i32 immAllOnesV), - (VINSERTF128rr - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm), - (V_SETALLONES), 1)>; +// To create a 256-bit all ones value, we should produce VCMPTRUEPS +// with YMM register containing zero. +// FIXME: Avoid producing vxorps to clear the fake inputs. +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; +} multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, PatFrag memop_frag> { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index d65eb1de8d09..de58d719acb4 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -56,13 +56,9 @@ private: bool selectImpl(MachineInstr &I) const; // TODO: remove after suported by Tablegen-erated instruction selection. - unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const; - unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const; unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const; - bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF) const; bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI, @@ -71,6 +67,10 @@ private: MachineFunction &MF) const; bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; const X86TargetMachine &TM; const X86Subtarget &STI; @@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const { "Generic instruction has unexpected implicit operands\n"); if (selectImpl(I)) - return true; + return true; DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); // TODO: This should be implemented by tblgen. 
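Aside (not part of the commit): the VCMPTRUEPS trick behind the new AVX1 v8i32 immAllOnesV pattern above can be sanity-checked with intrinsics. Comparison predicate 0x0f (_CMP_TRUE_UQ) makes vcmpps report true in every lane regardless of its inputs, so comparing a zeroed YMM register with itself yields all-ones. A minimal sketch, assuming an AVX-capable host (e.g. built with -mavx):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // The zeroed "fake" inputs (the vxorps the FIXME above mentions).
  __m256 zero = _mm256_setzero_ps();
  // vcmpps with predicate 0x0f (VCMPTRUEPS): every lane compares true,
  // producing a 256-bit all-ones value without AVX2's vpcmpeqd.
  __m256 ones = _mm256_cmp_ps(zero, zero, _CMP_TRUE_UQ);

  alignas(32) uint32_t lanes[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(lanes),
                     _mm256_castps_si256(ones));
  for (uint32_t l : lanes)
    std::printf("0x%08x\n", l); // prints 0xffffffff for all eight lanes
  return 0;
}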
- if (selectBinaryOp(I, MRI, MF)) - return true; if (selectLoadStoreOp(I, MRI, MF)) return true; if (selectFrameIndexOrGep(I, MRI, MF)) @@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectTrunc(I, MRI, MF)) return true; + if (selectZext(I, MRI, MF)) + return true; + if (selectCmp(I, MRI, MF)) + return true; return false; } -unsigned X86InstructionSelector::getFAddOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FADD; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VADDSSZrr; - } else if (STI.hasAVX()) { - return X86::VADDSSrr; - } else if (STI.hasSSE1()) { - return X86::ADDSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VADDSDZrr; - } else if (STI.hasAVX()) { - return X86::VADDSDrr; - } else if (STI.hasSSE2()) { - return X86::ADDSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VADDPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VADDPSrr; - } else if (STI.hasSSE1()) { - return X86::ADDPSrr; - } - } - - return TargetOpcode::G_FADD; -} - -unsigned X86InstructionSelector::getFSubOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FSUB; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VSUBSSZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSSrr; - } else if (STI.hasSSE1()) { - return X86::SUBSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VSUBSDZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSDrr; - } else if (STI.hasSSE2()) { - return X86::SUBSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VSUBPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VSUBPSrr; - } else if (STI.hasSSE1()) { - return X86::SUBPSrr; - } - } - - return TargetOpcode::G_FSUB; -} - -bool X86InstructionSelector::selectBinaryOp(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - - const unsigned DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - unsigned NewOpc = I.getOpcode(); - - switch (NewOpc) { - case TargetOpcode::G_FADD: - NewOpc = getFAddOp(Ty, RB); - break; - case TargetOpcode::G_FSUB: - NewOpc = getFSubOp(Ty, RB); - break; - default: - break; - } - - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const { @@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, return true; } +bool X86InstructionSelector::selectZext(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ZEXT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + if (SrcTy == LLT::scalar(1)) { + + unsigned AndOpc; + if (DstTy == LLT::scalar(32)) + AndOpc = X86::AND32ri8; + else if (DstTy == LLT::scalar(64)) + AndOpc = X86::AND64ri8; + else + return false; + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned DefReg = + 
MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank)); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) + .addImm(0) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + + MachineInstr &AndInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) + .addReg(DefReg) + .addImm(1); + + constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + + return false; +} + +bool X86InstructionSelector::selectCmp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ICMP) + return false; + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = X86::getX86ConditionCode( + (CmpInst::Predicate)I.getOperand(1).getPredicate()); + unsigned OpSet = X86::getSETFromCond(CC); + + unsigned LHS = I.getOperand(2).getReg(); + unsigned RHS = I.getOperand(3).getReg(); + + if (SwapArgs) + std::swap(LHS, RHS); + + unsigned OpCmp; + LLT Ty = MRI.getType(LHS); + + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMP8rr; + break; + case 16: + OpCmp = X86::CMP16rr; + break; + case 32: + OpCmp = X86::CMP32rr; + break; + case 64: + OpCmp = X86::CMP64rr; + break; + } + + MachineInstr &CmpInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp)) + .addReg(LHS) + .addReg(RHS); + + MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(OpSet), I.getOperand(0).getReg()); + + constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); + constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2a40399ba571..bc73bb1ae8c5 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, + FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, }; struct IntrinsicData { diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 4f5e70414aa9..cf26238c0239 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ZEXT, s32}, Legal); setAction({G_SEXT, s32}, Legal); - for (auto Ty : {s8, s16}) { + for (auto Ty : {s1, s8, s16}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfo64bit() { @@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({G_SEXT, Ty}, Legal); } - for (auto Ty : {s8, s16, s32}) { + for (auto Ty : {s1, s8, s16, s32}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE1() { diff --git a/lib/Target/X86/X86RegisterInfo.cpp 
b/lib/Target/X86/X86RegisterInfo.cpp index cf2ceef8013a..7e4cba1c8345 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { case CallingConv::X86_RegCall: if (Is64Bit) { if (IsWin64) { - return (HasSSE ? CSR_Win64_RegCall_SaveList : + return (HasSSE ? CSR_Win64_RegCall_SaveList : CSR_Win64_RegCall_NoSSE_SaveList); } else { - return (HasSSE ? CSR_SysV64_RegCall_SaveList : + return (HasSSE ? CSR_SysV64_RegCall_SaveList : CSR_SysV64_RegCall_NoSSE_SaveList); } } else { - return (HasSSE ? CSR_32_RegCall_SaveList : + return (HasSSE ? CSR_32_RegCall_SaveList : CSR_32_RegCall_NoSSE_SaveList); } case CallingConv::Cold: @@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_HHVM_RegMask; case CallingConv::X86_RegCall: if (Is64Bit) { - if (IsWin64) { - return (HasSSE ? CSR_Win64_RegCall_RegMask : + if (IsWin64) { + return (HasSSE ? CSR_Win64_RegCall_RegMask : CSR_Win64_RegCall_NoSSE_RegMask); } else { - return (HasSSE ? CSR_SysV64_RegCall_RegMask : + return (HasSSE ? CSR_SysV64_RegCall_RegMask : CSR_SysV64_RegCall_NoSSE_RegMask); } } else { - return (HasSSE ? CSR_32_RegCall_RegMask : + return (HasSSE ? CSR_32_RegCall_RegMask : CSR_32_RegCall_NoSSE_RegMask); } case CallingConv::Cold: diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index de1514243aeb..02be95e2e556 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -253,6 +253,11 @@ protected: /// True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// True if the LEA instruction has all three source operands: base, index, + /// and offset or if the LEA instruction uses base and index registers where + /// the base is EBP, RBP,or R13 + bool Slow3OpsLEA; + /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; @@ -490,6 +495,7 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 086f55dd60b5..c6a90725d89c 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); +void initializeFixupLEAPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); } // end namespace llvm @@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() { initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); + initializeFixupLEAPassPass(PR); initializeX86ExecutionDepsFixPass(PR); } @@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSFreeBSD()) return llvm::make_unique<X86FreeBSDTargetObjectFile>(); - if (TT.isOSLinux() || TT.isOSNaCl()) + if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSFuchsia()) return llvm::make_unique<X86FuchsiaTargetObjectFile>(); diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp 
index f3b619a2956a..80e18161a94b 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { - { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. - { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - - { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). - { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). - { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). - - { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. + { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. + { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { // pmuldq sequence. if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) - return LT.first * 30; + return LT.first * 32; if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; + // XOP has faster vXi8 shifts. + if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || + !ST->hasXOP()) + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } static const CostTblEntry AVX2UniformCostTable[] = { @@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 2 }, { ISD::SRA, MVT::v2i64, 2 }, // 256bit shifts require splitting if AVX2 didn't catch them above. - { ISD::SHL, MVT::v32i8, 2 }, - { ISD::SRL, MVT::v32i8, 4 }, - { ISD::SRA, MVT::v32i8, 4 }, - { ISD::SHL, MVT::v16i16, 2 }, - { ISD::SRL, MVT::v16i16, 4 }, - { ISD::SRA, MVT::v16i16, 4 }, - { ISD::SHL, MVT::v8i32, 2 }, - { ISD::SRL, MVT::v8i32, 4 }, - { ISD::SRA, MVT::v8i32, 4 }, - { ISD::SHL, MVT::v4i64, 2 }, - { ISD::SRL, MVT::v4i64, 4 }, - { ISD::SRA, MVT::v4i64, 4 }, + { ISD::SHL, MVT::v32i8, 2+2 }, + { ISD::SRL, MVT::v32i8, 4+2 }, + { ISD::SRA, MVT::v32i8, 4+2 }, + { ISD::SHL, MVT::v16i16, 2+2 }, + { ISD::SRL, MVT::v16i16, 4+2 }, + { ISD::SRA, MVT::v16i16, 4+2 }, + { ISD::SHL, MVT::v8i32, 2+2 }, + { ISD::SRL, MVT::v8i32, 4+2 }, + { ISD::SRA, MVT::v8i32, 4+2 }, + { ISD::SHL, MVT::v4i64, 2+2 }, + { ISD::SRL, MVT::v4i64, 4+2 }, + { ISD::SRA, MVT::v4i64, 4+2 }, }; // Look for XOP lowering tricks. 
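Context for the "+2" adjustments in these shift cost tables (an illustrative sketch, not taken from the commit): without AVX2, a 256-bit integer shift is legalized as two 128-bit operations plus an extract/insert to split and rejoin the halves, which is the extra cost being modeled. The helper name and the constant shift amount of 3 below are arbitrary choices for the example:

#include <immintrin.h>

// Uniform-constant v32i8 left shift by 3 on an AVX1-only target:
// 2 * (psllw + pand) for the two halves, plus the "split" overhead
// (vextractf128 / vinsertf128) that the +2 table entries account for.
static __m256i shl_v32i8_by3_avx1(__m256i v) {
  __m128i lo = _mm256_castsi256_si128(v);      // low half, no instruction
  __m128i hi = _mm256_extractf128_si256(v, 1); // vextractf128 (split)

  const __m128i mask = _mm_set1_epi8((char)0xF8); // drop bits shifted across byte lanes
  lo = _mm_and_si128(_mm_slli_epi16(lo, 3), mask); // psllw + pand
  hi = _mm_and_si128(_mm_slli_epi16(hi, 3), mask); // psllw + pand

  __m256i res = _mm256_castsi128_si256(lo);
  return _mm256_insertf128_si256(res, hi, 1);  // vinsertf128 (split)
}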
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v4i64, 2 }, // psllq. - - { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - - { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v8i32, 2 }, // psrad. - { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. - { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. + { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. + { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. + + { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. + { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. + { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. + + { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. + { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. + { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. }; if (ST->hasSSE2() && ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { + + // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. + if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) + return LT.first * 4; // 2*psrad + shuffle. + if (const auto *Entry = CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; @@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry SSE41CostTable[] = { - { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. - { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. - { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld - { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld - - { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. - { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. - { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend. - - { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. - { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence. - { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. - - { ISD::MUL, MVT::v4i32, 1 } // pmulld + { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. + { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split + + { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. + { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. + + { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. + { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; if (ST->hasSSE41()) @@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i16, 1 }, // pmullw - { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle - { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. + + { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ // It is not a good idea to vectorize division. 
We have to scalarize it and
  // in the process we will often end up having to spilling regular
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 500b26b3be17..3ee14a0ff7b1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
                        /*isVarArg=*/false);
   Function *Trampoline =
       Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
-                       Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+                       Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
                            ParentFunc->getName()),
                        TheModule);
   BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
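Closing note (illustrative, not part of the commit): the BZHI patterns added to X86InstrInfo.td above match the "x & (-1 >> (32 - y))" and "x << (32 - y) >> (32 - y)" idioms because both simply keep the low y bits of x, which is what a single BZHI instruction computes. A scalar sketch of that identity, assuming 1 <= n <= 32 so every shift count stays in range:

#include <cassert>
#include <cstdint>

// Keep the low n bits of x (1 <= n <= 32); this is what BZHI32rr produces.
static uint32_t mask_idiom(uint32_t x, unsigned n) {
  return x & (~0u >> (32 - n)); // the "x & (-1 >> (32 - y))" form
}

static uint32_t shift_idiom(uint32_t x, unsigned n) {
  return (x << (32 - n)) >> (32 - n); // the shift-in / shift-out form
}

int main() {
  for (unsigned n = 1; n <= 32; ++n)
    assert(mask_idiom(0xDEADBEEFu, n) == shift_idiom(0xDEADBEEFu, n));
  return 0;
}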