| author | Dimitry Andric <dim@FreeBSD.org> | 2018-08-02 17:42:12 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2018-08-02 17:42:12 +0000 |
| commit | 1c4688a8498fea1db507842ff8dedaacad8ef77b | |
| tree | e74f1bea0e682a4cd6d7edea69293ab7958eb9ae | /contrib/llvm/lib/Target/X86 |
| parent | 68dc77c284115e8f103290474b3b9e35a3906c53 | |
| parent | b7eb8e35e481a74962664b63dfb09483b200209a | |
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
28 files changed, 492 insertions, 396 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b84c2d31a63e..fafbed0bd935 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2603,11 +2603,11 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, bool HadVerifyError = false; // Append default arguments to "ins[bwld]" - if (Name.startswith("ins") && + if (Name.startswith("ins") && (Operands.size() == 1 || Operands.size() == 3) && (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" || Name == "ins")) { - + AddDefaultSrcDestOperands(TmpOperands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); @@ -2615,7 +2615,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } // Append default arguments to "outs[bwld]" - if (Name.startswith("outs") && + if (Name.startswith("outs") && (Operands.size() == 1 || Operands.size() == 3) && (Name == "outsb" || Name == "outsw" || Name == "outsl" || Name == "outsd" || Name == "outs")) { diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 82e82fe1efd9..0e861d5ddbc9 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -92,7 +92,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, // the hex value of the immediate operand when it isn't in the range // [-256,255]. if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) { - // Don't print unnecessary hex sign bits. + // Don't print unnecessary hex sign bits. if (Imm == (int16_t)(Imm)) *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm); else if (Imm == (int32_t)(Imm)) diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index d030f26d98de..f1d15e66918b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -307,10 +307,84 @@ class X86MCInstrAnalysis : public MCInstrAnalysis { public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} + bool isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const override; bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const override; }; +bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const { + if (STI.getCPU() == "btver2") { + // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and + // Jaguar pipeline", subsection 8 "Dependency-breaking instructions". 
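The switch that follows classifies AMD Jaguar (btver2) dependency-breaking instructions per the Agner Fog reference above. For most of the listed opcodes the hook reduces to a register-equality test on the two source operands; a toy sketch of that test in plain C++, not LLVM's MCInst API (all names here are illustrative):

```cpp
// Toy model: writing e.g. "pxor %xmm5, %xmm5" produces a result independent
// of the old value of %xmm5, so the core can treat the instruction as having
// no input dependency.
#include <cassert>

struct ToyRRInst {
  unsigned DstReg, Src1Reg, Src2Reg; // the two-source rr forms modeled here
};

// Dependency breaking only when both sources name the same register.
bool isDepBreakingRR(const ToyRRInst &I) {
  return I.Src1Reg == I.Src2Reg;
}

int main() {
  assert(isDepBreakingRR({5, 5, 5}));  // pxor %xmm5, %xmm5: zero idiom
  assert(!isDepBreakingRR({5, 6, 5})); // pxor %xmm5, %xmm6-style: real use
}
```

The CMP32rr/CMP64rr cases at the end of the switch apply the same idea to operands 0 and 1, since CMP defines no register result.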
+ switch (Inst.getOpcode()) { + default: + return false; + case X86::SUB32rr: + case X86::SUB64rr: + case X86::SBB32rr: + case X86::SBB64rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::XORPSrr: + case X86::XORPDrr: + case X86::VXORPSrr: + case X86::VXORPDrr: + case X86::ANDNPSrr: + case X86::VANDNPSrr: + case X86::ANDNPDrr: + case X86::VANDNPDrr: + case X86::PXORrr: + case X86::VPXORrr: + case X86::PANDNrr: + case X86::VPANDNrr: + case X86::PSUBBrr: + case X86::PSUBWrr: + case X86::PSUBDrr: + case X86::PSUBQrr: + case X86::VPSUBBrr: + case X86::VPSUBWrr: + case X86::VPSUBDrr: + case X86::VPSUBQrr: + case X86::PCMPEQBrr: + case X86::PCMPEQWrr: + case X86::PCMPEQDrr: + case X86::PCMPEQQrr: + case X86::VPCMPEQBrr: + case X86::VPCMPEQWrr: + case X86::VPCMPEQDrr: + case X86::VPCMPEQQrr: + case X86::PCMPGTBrr: + case X86::PCMPGTWrr: + case X86::PCMPGTDrr: + case X86::PCMPGTQrr: + case X86::VPCMPGTBrr: + case X86::VPCMPGTWrr: + case X86::VPCMPGTDrr: + case X86::VPCMPGTQrr: + case X86::MMX_PXORirr: + case X86::MMX_PANDNirr: + case X86::MMX_PSUBBirr: + case X86::MMX_PSUBDirr: + case X86::MMX_PSUBQirr: + case X86::MMX_PSUBWirr: + case X86::MMX_PCMPGTBirr: + case X86::MMX_PCMPGTDirr: + case X86::MMX_PCMPGTWirr: + case X86::MMX_PCMPEQBirr: + case X86::MMX_PCMPEQDirr: + case X86::MMX_PCMPEQWirr: + return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg(); + case X86::CMP32rr: + case X86::CMP64rr: + return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg(); + } + } + + return false; +} + bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const { diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index c49a6838fa44..d0fcbd313312 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -66,7 +66,7 @@ inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, // not to split i64 and double between a register and stack static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); - + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); // If this is the first part of an double/i64/i128, or if we're already diff --git a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp index f73455cc31b8..1c5f110d8c60 100644 --- a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -622,7 +622,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // If the CMOV group is not packed, e.g., there are debug instructions between // first CMOV and last CMOV, then pack the group and make the CMOV instruction - // consecutive by moving the debug instructions to after the last CMOV. + // consecutive by moving the debug instructions to after the last CMOV. 
packCmovGroup(Group.front(), Group.back()); // To convert a CMOVcc instruction, we actually have to insert the diamond diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index de8b40f28a86..35a15577fe09 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1195,7 +1195,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -2649,7 +2649,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::VMOVPDI2DIrr), ResultReg) .addReg(InputReg, RegState::Kill); - + // The result value is in the lower 16-bits of ResultReg. unsigned RegIdx = X86::sub_16bit; ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); @@ -3687,7 +3687,7 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; - + // No instruction is needed for conversion. Reuse the register used by // the fist operand. updateValueMap(I, Reg); diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index d85389a0a7f1..f3f7f6a37360 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -578,7 +578,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, continue; if (OptLEA) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) + if (MF.getSubtarget<X86Subtarget>().slowLEA()) processInstructionForSLM(I, MFI); else { diff --git a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index 1ba08d39c595..c17c51a7aeac 100644 --- a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -730,9 +730,12 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); - if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() && - TRI->isVirtualRegister(MI.getOperand(0).getReg())) + if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && + TRI->isVirtualRegister(MI.getOperand(0).getReg())) { + assert(MI.getOperand(0).isDef() && + "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); + } // Stop scanning when we see the first definition of the EFLAGS as prior to // this we would potentially capture the wrong flag state. 
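The `!MI.mayStore()` guard added in the hunk above distinguishes the two SETcc forms: `sete %al` defines a register the pass can reuse, while `sete (%rdi)` writes memory, so its operand 0 is an address component rather than a def. A minimal stand-in for the strengthened condition (plain C++, illustrative names only):

```cpp
// Only a non-storing SETcc whose operand 0 is a virtual-register def is
// recorded as holding a reusable condition value.
struct ToySetCC {
  bool ValidCond;  // getCondFromSETOpc(...) != X86::COND_INVALID
  bool MayStore;   // true for the SETcc-to-memory form
  bool DefIsVReg;  // operand 0 is a virtual register
};

bool recordsCondReg(const ToySetCC &MI) {
  return MI.ValidCond && !MI.MayStore && MI.DefIsVReg;
}
```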
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index ae748901164a..f330acff61a1 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -347,12 +347,12 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { LiveBundle &Bundle = LiveBundles[Bundles->getBundle(Entry->getNumber(), false)]; - + // In regcall convention, some FP registers may not be passed through // the stack, so they will need to be assigned to the stack first if ((Entry->getParent()->getFunction().getCallingConv() == CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) { - // In the register calling convention, up to one FP argument could be + // In the register calling convention, up to one FP argument could be // saved in the first FP register. // If bundle.mask is non-zero and Bundle.FixCount is zero, it means // that the FP registers contain arguments. @@ -991,7 +991,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2)); // Reset the FP Stack - It is required because of possible leftovers from - // passed arguments. The caller should assume that the FP stack is + // passed arguments. The caller should assume that the FP stack is // returned empty (unless the callee returns values on FP stack). while (StackTop > 0) popReg(); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index a257ec41f75b..e207c343fac8 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -68,7 +68,7 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { // needsFrameIndexResolution - Do we need to perform FI resolution for // this function. Normally, this is required only when the function // has any stack objects. However, FI resolution actually has another job, -// not apparent from the title - it resolves callframesetup/destroy +// not apparent from the title - it resolves callframesetup/destroy // that were not simplified earlier. // So, this is required for x86 functions that have push sequences even // when there are no stack objects. @@ -607,8 +607,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, int64_t RCXShadowSlot = 0; int64_t RDXShadowSlot = 0; - // If inlining in the prolog, save RCX and RDX. - // Future optimization: don't save or restore if not live in. + // If inlining in the prolog, save RCX and RDX. if (InProlog) { // Compute the offsets. We need to account for things already // pushed onto the stack at this point: return address, frame @@ -616,15 +615,30 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); const bool HasFP = hasFP(MF); - RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); - RDXShadowSlot = RCXShadowSlot + 8; - // Emit the saves. - addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, - RCXShadowSlot) - .addReg(X86::RCX); - addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, - RDXShadowSlot) - .addReg(X86::RDX); + + // Check if we need to spill RCX and/or RDX. + // Here we assume that no earlier prologue instruction changes RCX and/or + // RDX, so checking the block live-ins is enough. 
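The live-in checks just below decide whether RCX and RDX actually need shadow-slot saves. The slot arithmetic reads well in isolation; a self-contained sketch (the function name is illustrative):

```cpp
#include <cstdint>

// Each live-in register gets the first free shadow slot; RDX is bumped by 8
// only when RCX also needs a save, so a lone live register uses the base slot.
void assignShadowSlots(bool RCXLiveIn, bool RDXLiveIn, int64_t BaseSlot,
                       int64_t &RCXSlot, int64_t &RDXSlot) {
  RCXSlot = RCXLiveIn ? BaseSlot : 0;
  RDXSlot = RDXLiveIn ? BaseSlot : 0;
  if (RCXLiveIn && RDXLiveIn)
    RDXSlot += 8; // keep the two 8-byte saves disjoint
}
```

A slot value of 0 doubles as "not spilled" at the restore site further down, which is sound here because the base slot is at least 8 (the return address is already on the stack).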
+ const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX); + const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX); + int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + // Assign the initial slot to both registers, then change RDX's slot if both + // need to be spilled. + if (IsRCXLiveIn) + RCXShadowSlot = InitSlot; + if (IsRDXLiveIn) + RDXShadowSlot = InitSlot; + if (IsRDXLiveIn && IsRCXLiveIn) + RDXShadowSlot += 8; + // Emit the saves if needed. + if (IsRCXLiveIn) + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + if (IsRDXLiveIn) + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); } else { // Not in the prolog. Copy RAX to a virtual reg. BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); @@ -661,6 +675,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); // Add code to roundMBB to round the final stack pointer to a page boundary. + RoundMBB->addLiveIn(FinalReg); BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) .addReg(FinalReg) .addImm(PageMask); @@ -677,6 +692,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addMBB(LoopMBB); } + LoopMBB->addLiveIn(JoinReg); addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, false, -PageSize); @@ -688,6 +704,8 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addImm(0) .addReg(0) .addImm(0); + + LoopMBB->addLiveIn(RoundedReg); BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); @@ -697,16 +715,19 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, // If in prolog, restore RDX and RCX. if (InProlog) { - addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), - X86::RCX), - X86::RSP, false, RCXShadowSlot); - addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), - X86::RDX), - X86::RSP, false, RDXShadowSlot); + if (RCXShadowSlot) // It means we spilled RCX in the prologue. + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, + TII.get(X86::MOV64rm), X86::RCX), + X86::RSP, false, RCXShadowSlot); + if (RDXShadowSlot) // It means we spilled RDX in the prologue. + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, + TII.get(X86::MOV64rm), X86::RDX), + X86::RSP, false, RDXShadowSlot); } // Now that the probing is done, add code to continueMBB to update // the stack pointer for real. + ContinueMBB->addLiveIn(SizeReg); BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(SizeReg); @@ -734,8 +755,6 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, CMBBI->setFlag(MachineInstr::FrameSetup); } } - - // Possible TODO: physreg liveness for InProlog case. } void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, @@ -2694,7 +2713,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, Regs[FoundRegs++] = Regs[0]; for (int i = 0; i < NumPops; ++i) - BuildMI(MBB, MBBI, DL, + BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); return true; @@ -2984,7 +3003,7 @@ struct X86FrameSortingComparator { // in general. Something to keep in mind, though. if (DensityAScaled == DensityBScaled) return A.ObjectAlignment < B.ObjectAlignment; - + return DensityAScaled < DensityBScaled; } }; @@ -3020,7 +3039,7 @@ void X86FrameLowering::orderFrameObjects( if (ObjectSize == 0) // Variable size. 
Just use 4. SortingObjects[Obj].ObjectSize = 4; - else + else SortingObjects[Obj].ObjectSize = ObjectSize; } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 7dcdb7967058..2820004cfc6d 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1800,17 +1800,19 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const { } MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; - return TargetLowering::getRegisterTypeForCallingConv(Context, VT); + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; - return TargetLowering::getNumRegistersForCallingConv(Context, VT); + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, @@ -23366,7 +23368,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, return DAG.getBuildVector(VT, dl, Elts); } - // If the target doesn't support variable shifts, use either FP conversion + // If the target doesn't support variable shifts, use either FP conversion // or integer multiplication to avoid shifting each element individually. if (VT == MVT::v4i32) { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); @@ -23509,6 +23511,24 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG)) return DAG.getNode(ISD::MUL, dl, VT, R, Scale); + // Constant ISD::SRL can be performed efficiently on vXi8/vXi16 vectors as we + // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt). + // TODO: Improve support for the shift by zero special case. + if (Op.getOpcode() == ISD::SRL && ConstantAmt && + ((Subtarget.hasSSE41() && VT == MVT::v8i16) || + DAG.isKnownNeverZero(Amt)) && + (VT == MVT::v16i8 || VT == MVT::v8i16 || + ((VT == MVT::v32i8 || VT == MVT::v16i16) && Subtarget.hasInt256()))) { + SDValue EltBits = DAG.getConstant(VT.getScalarSizeInBits(), dl, VT); + SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); + if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ); + SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale); + return DAG.getSelect(dl, VT, ZAmt, R, Res); + } + } + // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 @@ -33425,33 +33445,32 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } } - // Handle (CMOV C-1, (ADD (CTTZ X), C), (X != 0)) -> - // (ADD (CMOV (CTTZ X), -1, (X != 0)), C) or - // (CMOV (ADD (CTTZ X), C), C-1, (X == 0)) -> - // (ADD (CMOV C-1, (CTTZ X), (X == 0)), C) - if (CC == X86::COND_NE || CC == X86::COND_E) { - auto *Cnst = CC == X86::COND_E ? dyn_cast<ConstantSDNode>(TrueOp) - : dyn_cast<ConstantSDNode>(FalseOp); - SDValue Add = CC == X86::COND_E ? 
FalseOp : TrueOp; - - if (Cnst && Add.getOpcode() == ISD::ADD && Add.hasOneUse()) { - auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); - SDValue AddOp2 = Add.getOperand(0); - if (AddOp1 && (AddOp2.getOpcode() == ISD::CTTZ_ZERO_UNDEF || - AddOp2.getOpcode() == ISD::CTTZ)) { - APInt Diff = Cnst->getAPIntValue() - AddOp1->getAPIntValue(); - if (CC == X86::COND_E) { - Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), AddOp2, - DAG.getConstant(Diff, DL, Add.getValueType()), - DAG.getConstant(CC, DL, MVT::i8), Cond); - } else { - Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), - DAG.getConstant(Diff, DL, Add.getValueType()), - AddOp2, DAG.getConstant(CC, DL, MVT::i8), Cond); - } - return DAG.getNode(X86ISD::ADD, DL, Add.getValueType(), Add, - SDValue(AddOp1, 0)); - } + // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) -> + // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) + // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> + // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) + if ((CC == X86::COND_NE || CC == X86::COND_E) && + Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { + SDValue Add = TrueOp; + SDValue Const = FalseOp; + // Canonicalize the condition code for easier matching and output. + if (CC == X86::COND_E) { + std::swap(Add, Const); + CC = X86::COND_NE; + } + + // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. + if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD && + Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) && + (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Add.getOperand(0).getOpcode() == ISD::CTTZ) && + Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { + EVT VT = N->getValueType(0); + // This should constant fold. + SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); + SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), + DAG.getConstant(CC, DL, MVT::i8), Cond); + return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } @@ -33873,31 +33892,42 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!C) return SDValue(); - uint64_t MulAmt = C->getZExtValue(); - if (isPowerOf2_64(MulAmt)) + if (isPowerOf2_64(C->getZExtValue())) return SDValue(); + int64_t SignMulAmt = C->getSExtValue(); + assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); + uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; + SDLoc DL(N); - if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9) - return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - N->getOperand(1)); + if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { + SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(AbsMulAmt, DL, VT)); + if (SignMulAmt < 0) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + NewMul); + + return NewMul; + } uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; - if ((MulAmt % 9) == 0) { + if ((AbsMulAmt % 9) == 0) { MulAmt1 = 9; - MulAmt2 = MulAmt / 9; - } else if ((MulAmt % 5) == 0) { + MulAmt2 = AbsMulAmt / 9; + } else if ((AbsMulAmt % 5) == 0) { MulAmt1 = 5; - MulAmt2 = MulAmt / 5; - } else if ((MulAmt % 3) == 0) { + MulAmt2 = AbsMulAmt / 5; + } else if ((AbsMulAmt % 3) == 0) { MulAmt1 = 3; - MulAmt2 = MulAmt / 3; + MulAmt2 = AbsMulAmt / 3; } SDValue NewMul; + // For negative multiply amounts, only allow MulAmt2 to be a power of 2. 
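A worked instance of the rule just stated, as a standalone scalar sketch: for C = -18 the absolute amount factors as 9 * 2, and MulAmt2 = 2 is a power of two, so the decomposition remains allowed and only gains the final negate:

```cpp
#include <cassert>
#include <cstdint>

// x * -18 lowered the way the combine permits: an LEA-style x*9, a
// power-of-two shift for MulAmt2, then the negate the patch appends.
int64_t mulByNeg18(int64_t X) {
  int64_t M = X + X * 8; // x*9, one LEA
  M <<= 1;               // MulAmt2 == 2, a power of two
  return 0 - M;          // SignMulAmt < 0: subtract from zero
}

int main() { assert(mulByNeg18(7) == 7 * -18); }
```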
if (MulAmt2 && - (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + (isPowerOf2_64(MulAmt2) || + (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -33919,17 +33949,19 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + + // Negate the result. + if (SignMulAmt < 0) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + NewMul); } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); + NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); if (!NewMul) { - assert(MulAmt != 0 && - MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && + assert(C->getZExtValue() != 0 && + C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); - int64_t SignMulAmt = C->getSExtValue(); - assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); - uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; if (isPowerOf2_64(AbsMulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( @@ -36738,6 +36770,145 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, return DAG.getNode(Opc, DL, VT, LHS, RHS); } +// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes +// from one vector with signed bytes from another vector, adds together +// adjacent pairs of 16-bit products, and saturates the result before +// truncating to 16-bits. +// +// Which looks something like this: +// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), +// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) +static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + const SDLoc &DL) { + if (!VT.isVector() || !Subtarget.hasSSSE3()) + return SDValue(); + + unsigned NumElems = VT.getVectorNumElements(); + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems)) + return SDValue(); + + SDValue SSatVal = detectSSatPattern(In, VT); + if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) + return SDValue(); + + // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs + // of multiplies from even/odd elements. + SDValue N0 = SSatVal.getOperand(0); + SDValue N1 = SSatVal.getOperand(1); + + if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + SDValue N10 = N1.getOperand(0); + SDValue N11 = N1.getOperand(1); + + // TODO: Handle constant vectors and use knownbits/computenumsignbits? + // Canonicalize zero_extend to LHS. + if (N01.getOpcode() == ISD::ZERO_EXTEND) + std::swap(N00, N01); + if (N11.getOpcode() == ISD::ZERO_EXTEND) + std::swap(N10, N11); + + // Ensure we have a zero_extend and a sign_extend. + if (N00.getOpcode() != ISD::ZERO_EXTEND || + N01.getOpcode() != ISD::SIGN_EXTEND || + N10.getOpcode() != ISD::ZERO_EXTEND || + N11.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + // Peek through the extends. + N00 = N00.getOperand(0); + N01 = N01.getOperand(0); + N10 = N10.getOperand(0); + N11 = N11.getOperand(0); + + // Ensure the extend is from vXi8. 
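Alongside the element-type checks below, a scalar reference for what one i16 output lane of the matched PMADDUBSW computes (semantics only, not LLVM code):

```cpp
#include <algorithm>
#include <cstdint>

// Unsigned bytes from A times signed bytes from B, adjacent products summed,
// then signed-saturated into 16 bits.
int16_t pmaddubswLane(uint8_t A0, uint8_t A1, int8_t B0, int8_t B1) {
  int32_t Sum = int32_t(A0) * B0 + int32_t(A1) * B1;
  return int16_t(std::clamp<int32_t>(Sum, -32768, 32767));
}
```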
+ if (N00.getValueType().getVectorElementType() != MVT::i8 || + N01.getValueType().getVectorElementType() != MVT::i8 || + N10.getValueType().getVectorElementType() != MVT::i8 || + N11.getValueType().getVectorElementType() != MVT::i8) + return SDValue(); + + // All inputs should be build_vectors. + if (N00.getOpcode() != ISD::BUILD_VECTOR || + N01.getOpcode() != ISD::BUILD_VECTOR || + N10.getOpcode() != ISD::BUILD_VECTOR || + N11.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // N00/N10 are zero extended. N01/N11 are sign extended. + + // For each element, we need to ensure we have an odd element from one vector + // multiplied by the odd element of another vector and the even element from + // one of the same vectors being multiplied by the even element from the + // other vector. So we need to make sure for each element i, this operator + // is being performed: + // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] + SDValue ZExtIn, SExtIn; + for (unsigned i = 0; i != NumElems; ++i) { + SDValue N00Elt = N00.getOperand(i); + SDValue N01Elt = N01.getOperand(i); + SDValue N10Elt = N10.getOperand(i); + SDValue N11Elt = N11.getOperand(i); + // TODO: Be more tolerant to undefs. + if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1)); + auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1)); + auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1)); + auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1)); + if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) + return SDValue(); + unsigned IdxN00 = ConstN00Elt->getZExtValue(); + unsigned IdxN01 = ConstN01Elt->getZExtValue(); + unsigned IdxN10 = ConstN10Elt->getZExtValue(); + unsigned IdxN11 = ConstN11Elt->getZExtValue(); + // Add is commutative so indices can be reordered. + if (IdxN00 > IdxN10) { + std::swap(IdxN00, IdxN10); + std::swap(IdxN01, IdxN11); + } + // N0 indices be the even element. N1 indices must be the next odd element. + if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || + IdxN01 != 2 * i || IdxN11 != 2 * i + 1) + return SDValue(); + SDValue N00In = N00Elt.getOperand(0); + SDValue N01In = N01Elt.getOperand(0); + SDValue N10In = N10Elt.getOperand(0); + SDValue N11In = N11Elt.getOperand(0); + // First time we find an input capture it. + if (!ZExtIn) { + ZExtIn = N00In; + SExtIn = N01In; + } + if (ZExtIn != N00In || SExtIn != N01In || + ZExtIn != N10In || SExtIn != N11In) + return SDValue(); + } + + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + // Shrink by adding truncate nodes and let DAGCombine fold with the + // sources. 
+ EVT InVT = Ops[0].getValueType(); + assert(InVT.getScalarType() == MVT::i8 && + "Unexpected scalar element type"); + assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + InVT.getVectorNumElements() / 2); + return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); + }; + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, + PMADDBuilder); +} + static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); @@ -36752,6 +36923,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to detect PMADD + if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) + return PMAdd; + // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; @@ -36793,38 +36968,14 @@ static SDValue isFNEG(SDNode *N) { if (!Op1.getValueType().isFloatingPoint()) return SDValue(); - SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); - - unsigned EltBits = Op1.getScalarValueSizeInBits(); - auto isSignMask = [&](const ConstantFP *C) { - return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); - }; - - // There is more than one way to represent the same constant on - // the different X86 targets. The type of the node may also depend on size. - // - load scalar value and broadcast - // - BUILD_VECTOR node - // - load from a constant pool. - // We check all variants here. - if (Op1.getOpcode() == X86ISD::VBROADCAST) { - if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) - if (isSignMask(cast<ConstantFP>(C))) - return Op0; - - } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { - if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) - if (isSignMask(CN->getConstantFPValue())) - return Op0; + // Extract constant bits and see if they are all sign bit masks. + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(), + UndefElts, EltBits, false, false)) + if (llvm::all_of(EltBits, [](APInt &I) { return I.isSignMask(); })) + return peekThroughBitcasts(Op.getOperand(0)); - } else if (auto *C = getTargetConstantFromNode(Op1)) { - if (C->getType()->isVectorTy()) { - if (auto *SplatV = C->getSplatValue()) - if (isSignMask(cast<ConstantFP>(SplatV))) - return Op0; - } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) - if (isSignMask(FPConst)) - return Op0; - } return SDValue(); } @@ -37777,8 +37928,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(V.getOperand(1)) && - cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) { + isNullConstant(V.getOperand(1))) { if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) { NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), @@ -38896,7 +39046,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } - // N0 indices be the even elemtn. N1 indices must be the next odd element. + // N0 indices be the even element. N1 indices must be the next odd element. 
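The check below (shared with detectPMADDUBSW earlier in this patch) enforces that lane i consumes elements 2i and 2i+1 of each input; because the add commutes, the two products may arrive in either order, hence the swap first. A compact sketch:

```cpp
#include <utility>

// For lane i the four extract indices must pair up as (2i, 2i) and
// (2i+1, 2i+1) after canonicalizing the commutable product order.
bool pairsMatchLane(unsigned i, unsigned IdxN00, unsigned IdxN01,
                    unsigned IdxN10, unsigned IdxN11) {
  if (IdxN00 > IdxN10) {
    std::swap(IdxN00, IdxN10);
    std::swap(IdxN01, IdxN11);
  }
  return IdxN00 == 2 * i && IdxN10 == 2 * i + 1 &&
         IdxN01 == 2 * i && IdxN11 == 2 * i + 1;
}
```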
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); @@ -39322,8 +39472,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); - if (Idx2 && Idx2->getZExtValue() == 0) { + if (isNullConstant(Vec.getOperand(2))) { SDValue SubVec2 = Vec.getOperand(1); // If needed, look through bitcasts to get to the load. if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 32215b170a8c..ff5006d208e5 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -1097,10 +1097,11 @@ namespace llvm { /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; - MVT getRegisterTypeForCallingConv(LLVMContext &Context, + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const override; bool isIntDivCheap(EVT VT, AttributeList Attr) const override; @@ -1125,8 +1126,8 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, - SDValue Addr, SelectionDAG &DAG) + SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, + SDValue Addr, SelectionDAG &DAG) const override; protected: diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 5d8400595bfa..7d31cfab4137 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1576,7 +1576,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST *mr. + // FIXME: TEST*rr -> swapped operand of TEST *mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 1b61accfb42b..96db8b4e7585 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7725,7 +7725,7 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. It = MBB.insert(It, - BuildMI(MF, DebugLoc(), get(X86::JMP_1)) + BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64)) .addGlobalAddress(M.getNamedValue(MF.getName()))); } else { // No, insert a call. diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 7509b312c100..bc7afd32d494 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1750,7 +1750,7 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags // Bit tests instructions: BT, BTS, BTR, BTC. 
let Defs = [EFLAGS] in { -let SchedRW = [WriteALU] in { +let SchedRW = [WriteBitTest] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, @@ -1783,7 +1783,7 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { []>, TB, NotMemoryFoldable; } -let SchedRW = [WriteALU] in { +let SchedRW = [WriteBitTest] in { def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, @@ -1818,7 +1818,7 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), } // SchedRW let hasSideEffects = 0 in { -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1842,7 +1842,7 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), @@ -1861,7 +1861,7 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), Requires<[In64BitMode]>; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1885,7 +1885,7 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; @@ -1908,7 +1908,7 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), Requires<[In64BitMode]>; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1932,7 +1932,7 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index ee3b01159174..023137634df1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -650,9 +650,9 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), // Double shift instructions 
(generalizations of rotate) //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst", SchedRW = [WriteShiftDouble] in { +let Constraints = "$src1 = $dst" in { -let Uses = [CL] in { +let Uses = [CL], SchedRW = [WriteSHDrrcl] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", @@ -683,9 +683,9 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, TB; -} +} // SchedRW -let isCommutable = 1 in { // These instructions commute to each other. +let isCommutable = 1, SchedRW = [WriteSHDrri] in { // These instructions commute to each other. def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, u8imm:$src3), @@ -728,11 +728,10 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))]>, TB; -} -} // Constraints = "$src = $dst", SchedRW +} // SchedRW +} // Constraints = "$src = $dst" -let SchedRW = [WriteShiftDoubleLd, WriteRMW] in { -let Uses = [CL] in { +let Uses = [CL], SchedRW = [WriteSHDmrcl] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), @@ -759,8 +758,9 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)]>, TB; -} +} // SchedRW +let SchedRW = [WriteSHDmri] in { def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", diff --git a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td index c7713fea70fa..6334d9e89a60 100755 --- a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -119,8 +119,8 @@ defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 25, [1, 10]>; defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 25, [1, 10]>; defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 25, [1, 10]>; -defm : BWWriteResPair<WriteBSWAP32,[BWPort15], 1>; // -defm : BWWriteResPair<WriteBSWAP64,[BWPort06, BWPort15], 2, [1, 1], 2>; // +defm : X86WriteRes<WriteBSWAP32, [BWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [BWPort06, BWPort15], 2, [1, 1], 2>; defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. @@ -137,6 +137,7 @@ def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [BWPort06]>; +def : WriteRes<WriteBitTest,[BWPort06]>; // Bit Test instrs // Bit counts. defm : BWWriteResPair<WriteBSF, [BWPort1], 3>; @@ -148,8 +149,11 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>; // Integer shifts and rotates. defm : BWWriteResPair<WriteShift, [BWPort06], 1>; -// Double shift instructions. -defm : BWWriteResPair<WriteShiftDouble, [BWPort06], 1>; +// SHLD/SHRD. 
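The per-model definitions just below replace the old single-class WriteShiftDouble approximation. Collected from this patch's Sandy Bridge, Haswell, Broadwell, and Skylake hunks, the new SHLD/SHRD numbers (latency / micro-ops):

| Form | Class | SNB | HSW | BDW | SKL/SKX |
|---|---|---|---|---|---|
| shxd r, r, i8 | WriteSHDrri | 2 / 2 | 3 / 1 | 3 / 1 | 3 / 1 |
| shxd r, r, CL | WriteSHDrrcl | 4 / 4 | 6 / 4 | 6 / 4 | 6 / 4 |
| shxd m, r, i8 | WriteSHDmri | 8 / 5 | 10 / 4 | 9 / 4 | 9 / 4 |
| shxd m, r, CL | WriteSHDmrcl | 10 / 7 | 12 / 6 | 11 / 6 | 11 / 6 |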
+defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>; // BMI1 BEXTR, BMI2 BZHI defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>; @@ -600,14 +604,6 @@ def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> { let ResourceCycles = [1]; } def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>; -def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> { let Latency = 1; @@ -746,8 +742,6 @@ def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr", "PDEP(32|64)rr", "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8", "(V?)CVTDQ2PS(Y?)rr")>; def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> { @@ -1055,14 +1049,6 @@ def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> { def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>; def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>; -def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1307,14 +1293,6 @@ def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm", "VPBROADCASTW(Y?)rm")>; -def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1380,14 +1358,6 @@ def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { } def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>; -def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> { - let Latency = 11; - let NumMicroOps = 6; - let ResourceCycles = [1,1,1,1,2]; -} -def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { let Latency = 11; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 189dd4183839..876c3e4162cf 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -118,17 +118,26 @@ defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>; defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>; def : WriteRes<WriteZero, []>; +// Arithmetic. 
defm : HWWriteResPair<WriteALU, [HWPort0156], 1>; -defm : HWWriteResPair<WriteADC, [HWPort06,HWPort0156], 2, [1,1], 2>; +defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>; defm : HWWriteResPair<WriteIMul, [HWPort1], 3>; defm : HWWriteResPair<WriteIMul64, [HWPort1], 3>; -defm : HWWriteResPair<WriteBSWAP32,[HWPort15], 1>; -defm : HWWriteResPair<WriteBSWAP64,[HWPort06, HWPort15], 2, [1,1], 2>; +defm : X86WriteRes<WriteBSWAP32, [HWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [HWPort06, HWPort15], 2, [1,1], 2>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } + +// Integer shifts and rotates. defm : HWWriteResPair<WriteShift, [HWPort06], 1>; -defm : HWWriteResPair<WriteShiftDouble, [HWPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [HWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[HWPort1, HWPort06, HWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [HWPort1, HWPort23, HWPort237, HWPort0156], 10, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[HWPort1, HWPort23, HWPort237, HWPort06, HWPort0156], 12, [1, 1, 1, 1, 2], 6>; + defm : HWWriteResPair<WriteJump, [HWPort06], 1>; defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>; @@ -141,6 +150,7 @@ def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [HWPort06]>; +def : WriteRes<WriteBitTest,[HWPort06]>; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -886,14 +896,6 @@ def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> { let ResourceCycles = [1]; } def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>; -def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> { let Latency = 1; @@ -1240,8 +1242,6 @@ def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr", "PDEP(32|64)rr", "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8", "(V?)CVTDQ2PS(Y?)rr")>; def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> { @@ -1513,14 +1513,6 @@ def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> { } def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>; -def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> { - let Latency = 10; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1638,14 +1630,6 @@ def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { } def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>; -def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1660,14 +1644,6 @@ def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> { } def: InstRW<[HWWriteResGroup108], (instrs STD)>; -def HWWriteResGroup109 : 
SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> { - let Latency = 12; - let NumMicroOps = 6; - let ResourceCycles = [1,1,1,1,2]; -} -def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { let Latency = 7; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td index 3b543c680ef4..6b7bbdea860a 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -106,13 +106,14 @@ def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; } def : WriteRes<WriteMove, [SBPort015]>; def : WriteRes<WriteZero, []>; +// Arithmetic. defm : SBWriteResPair<WriteALU, [SBPort015], 1>; defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>; defm : SBWriteResPair<WriteIMul, [SBPort1], 3>; defm : SBWriteResPair<WriteIMul64, [SBPort1], 3>; -defm : SBWriteResPair<WriteBSWAP32,[SBPort1], 1>; -defm : SBWriteResPair<WriteBSWAP64,[SBPort1,SBPort05], 2, [1,1], 2>; +defm : X86WriteRes<WriteBSWAP32, [SBPort1], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SBPort1,SBPort05], 2, [1,1], 2>; defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>; defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>; @@ -125,8 +126,13 @@ defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SBPort05, SBPort015], 2, [1, 1], 2>; +defm : X86WriteRes<WriteSHDrrcl,[SBPort05, SBPort015], 4, [3, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SBPort4,SBPort23,SBPort05,SBPort015], 8, [1, 2, 1, 1], 5>; +defm : X86WriteRes<WriteSHDmrcl,[SBPort4,SBPort23,SBPort05,SBPort015], 10, [1, 2, 3, 1], 7>; + defm : SBWriteResPair<WriteShift, [SBPort05], 1>; -defm : SBWriteResPair<WriteShiftDouble, [SBPort05], 1>; defm : SBWriteResPair<WriteJump, [SBPort5], 1>; defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>; @@ -139,6 +145,7 @@ def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [SBPort05]>; +def : WriteRes<WriteBitTest,[SBPort05]>; // This is for simple LEAs with one or two input operands. 
// The complex ones can only execute on port 1, and they require two cycles on @@ -564,14 +571,6 @@ def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> { let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>; -def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { let Latency = 1; @@ -630,14 +629,6 @@ def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ)>; def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>; -def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8")>; - def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { let Latency = 3; let NumMicroOps = 1; @@ -728,14 +719,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>; -def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> { - let Latency = 4; - let NumMicroOps = 4; - let ResourceCycles = [3,1]; -} -def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { let Latency = 5; let NumMicroOps = 1; @@ -1027,14 +1010,6 @@ def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { } def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>; -def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { - let Latency = 8; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1130,14 +1105,6 @@ def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> { def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", "ILD_F(16|32|64)m")>; -def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { - let Latency = 10; - let NumMicroOps = 7; - let ResourceCycles = [1,2,3,1]; -} -def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> { let Latency = 11; let NumMicroOps = 2; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 1417799d76be..bda088e1512f 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -110,8 +110,8 @@ defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication. defm : SKLWriteResPair<WriteIMul64, [SKLPort1], 3>; // Integer 64-bit multiplication. 
-defm : SKLWriteResPair<WriteBSWAP32,[SKLPort15], 1>; // -defm : SKLWriteResPair<WriteBSWAP64,[SKLPort06, SKLPort15], 2, [1,1], 2>; // +defm : X86WriteRes<WriteBSWAP32, [SKLPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SKLPort06, SKLPort15], 2, [1,1], 2>; defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; @@ -136,6 +136,7 @@ def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [SKLPort06]>; +def : WriteRes<WriteBitTest,[SKLPort06]>; // // Bit counts. defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>; @@ -147,8 +148,11 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>; // Integer shifts and rotates. defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>; -// Double shift instructions. -defm : SKLWriteResPair<WriteShiftDouble, [SKLPort06], 1>; +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SKLPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SKLPort1,SKLPort06,SKLPort0156], 6, [1, 2, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SKLPort1,SKLPort23,SKLPort237,SKLPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156], 11, [1, 1, 1, 2, 1], 6>; // BMI1 BEXTR, BMI2 BZHI defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>; @@ -602,14 +606,6 @@ def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>; -def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> { let Latency = 1; @@ -743,9 +739,7 @@ def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr", - "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8")>; + "PEXT(32|64)rr")>; def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> { let Latency = 4; @@ -1096,14 +1090,6 @@ def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> { } def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>; -def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,2,1]; -} -def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1392,14 +1378,6 @@ def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm", "(V?)PHSUBSWrm")>; -def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1519,14 +1497,6 @@ def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm", "CVT(T?)PD2DQrm", "MMX_CVT(T?)PD2PIirm")>; -def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { - let Latency = 11; - let NumMicroOps = 6; - let 
-}
-def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL",
-                                   "SHRD(16|32|64)mrCL")>;
-
def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 7095ec081bd9..9d5f8555c505 100755
--- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -110,8 +110,8 @@ defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op
defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication.
defm : SKXWriteResPair<WriteIMul64, [SKXPort1], 3>; // Integer 64-bit multiplication.
-defm : SKXWriteResPair<WriteBSWAP32,[SKXPort15], 1>; //
-defm : SKXWriteResPair<WriteBSWAP64,[SKXPort06, SKXPort15], 2, [1,1], 2>; //
+defm : X86WriteRes<WriteBSWAP32, [SKXPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SKXPort06, SKXPort15], 2, [1,1], 2>;
defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
@@ -136,12 +136,16 @@ def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [SKXPort06]>;
+def : WriteRes<WriteBitTest,[SKXPort06]>;
//
// Integer shifts and rotates.
defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
-// Double shift instructions.
-defm : SKXWriteResPair<WriteShiftDouble, [SKXPort06], 1>;
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [SKXPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[SKXPort1,SKXPort06,SKXPort0156], 6, [1, 2, 1], 4>;
+defm : X86WriteRes<WriteSHDmri, [SKXPort1,SKXPort23,SKXPort237,SKXPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156], 11, [1, 1, 1, 2, 1], 6>;
// Bit counts.
defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>;
@@ -615,14 +619,6 @@ def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
let ResourceCycles = [1];
}
def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
-def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8",
-                                 "BT(16|32|64)rr",
-                                 "BTC(16|32|64)ri8",
-                                 "BTC(16|32|64)rr",
-                                 "BTR(16|32|64)ri8",
-                                 "BTR(16|32|64)rr",
-                                 "BTS(16|32|64)ri8",
-                                 "BTS(16|32|64)rr")>;
def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
let Latency = 1;
@@ -783,9 +779,7 @@ def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
let ResourceCycles = [1];
}
def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr",
-                                  "PEXT(32|64)rr",
-                                  "SHLD(16|32|64)rri8",
-                                  "SHRD(16|32|64)rri8")>;
+                                  "PEXT(32|64)rr")>;
def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> {
let Latency = 4;
@@ -1270,14 +1264,6 @@ def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr",
                                  "VCVTSI642SSZrr",
                                  "VCVTUSI642SSZrr")>;
-def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
- let Latency = 6;
- let NumMicroOps = 4;
- let ResourceCycles = [1,2,1];
-}
-def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL",
-                                  "SHRD(16|32|64)rrCL")>;
-
def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 4;
@@ -1830,14 +1816,6 @@ def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm",
                                   "(V?)PHSUBSWrm")>;
-def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,1,1];
-}
-def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8",
-                                   "SHRD(16|32|64)mri8")>;
-
def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
let Latency = 9;
let NumMicroOps = 5;
@@ -2033,14 +2011,6 @@ def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
}
def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>;
-def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
- let Latency = 11;
- let NumMicroOps = 6;
- let ResourceCycles = [1,1,1,2,1];
-}
-def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL",
-                                   "SHRD(16|32|64)mrCL")>;
-
def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 11;
let NumMicroOps = 7;
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
index d0167753ccd4..ef9ce94706df 100644
--- a/contrib/llvm/lib/Target/X86/X86Schedule.td
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -118,8 +118,8 @@ defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication.
def WriteIMulH : SchedWrite; // Integer multiplication, high part.
def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
-defm WriteBSWAP32: X86SchedWritePair; // Byte Order (Endiannes) Swap
-defm WriteBSWAP64: X86SchedWritePair; // Byte Order (Endiannes) Swap
+def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap.
+def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap.
// Integer division.
defm WriteDiv8 : X86SchedWritePair;
@@ -142,11 +142,15 @@ def WriteFCMOV : SchedWrite; // X87 conditional move.
def WriteSETCC : SchedWrite; // Set register based on condition code.
def WriteSETCCStore : SchedWrite;
def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH.
+def WriteBitTest : SchedWrite; // Bit Test - TODO add memory folding support
// Integer shifts and rotates.
defm WriteShift : X86SchedWritePair;
// Double shift instructions.
-defm WriteShiftDouble : X86SchedWritePair;
+def WriteSHDrri : SchedWrite;
+def WriteSHDrrcl : SchedWrite;
+def WriteSHDmri : SchedWrite;
+def WriteSHDmrcl : SchedWrite;
// BMI1 BEXTR, BMI2 BZHI
defm WriteBEXTR : X86SchedWritePair;
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
index d1e902e6c43f..a7f461c456bd 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -81,8 +81,8 @@ defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
defm : AtomWriteResPair<WriteIMul, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
-defm : AtomWriteResPair<WriteBSWAP32, [AtomPort0], [AtomPort0]>;
-defm : AtomWriteResPair<WriteBSWAP64, [AtomPort0], [AtomPort0]>;
+defm : X86WriteRes<WriteBSWAP32, [AtomPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [AtomPort0], 1, [1], 1>;
defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
@@ -108,6 +108,7 @@ def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
let Latency = 2;
let ResourceCycles = [2];
}
+def : WriteRes<WriteBitTest,[AtomPort01]>;
defm : X86WriteResUnsupported<WriteIMulH>;
@@ -150,11 +151,10 @@ defm : X86WriteResPairUnsupported<WriteBZHI>;
defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>;
-////////////////////////////////////////////////////////////////////////////////
-// Double shift instructions.
-////////////////////////////////////////////////////////////////////////////////
-
-defm : AtomWriteResPair<WriteShiftDouble, [AtomPort0], [AtomPort0]>;
+defm : X86WriteRes<WriteSHDrri, [AtomPort01], 2, [2], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[AtomPort01], 2, [2], 1>;
+defm : X86WriteRes<WriteSHDmri, [AtomPort01], 4, [4], 1>;
+defm : X86WriteRes<WriteSHDmrcl,[AtomPort01], 4, [4], 1>;
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
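The four WriteSHD* classes defined above replace the single WriteShiftDouble pair so that each form of the double shift (register/immediate, register/CL, memory/immediate, memory/CL) can carry its own latency, micro-op count, and port usage, as the per-model numbers in the surrounding hunks show. For reference, a minimal behavioral sketch of the instruction being scheduled, 32-bit SHLD (SHRD is the mirror image); the helper name and test values are illustrative only, not LLVM code:

```cpp
#include <cassert>
#include <cstdint>

// Behavioral model of SHLD r32, r32, imm8: shift Dst left by Count and fill
// the vacated low bits from the high bits of Src. Hardware masks the count
// to 5 bits for 32-bit operands; SHRD does the same shifting rightwards.
static uint32_t shld32(uint32_t Dst, uint32_t Src, unsigned Count) {
  Count &= 31;
  if (Count == 0)
    return Dst; // a zero count leaves the destination unchanged
  return (Dst << Count) | (Src >> (32 - Count));
}

int main() {
  assert(shld32(0x12345678u, 0x9ABCDEF0u, 8) == 0x3456789Au);
  assert(shld32(0x80000000u, 0xFFFFFFFFu, 1) == 0x00000001u);
  return 0;
}
```

Splitting the classes matters because, on most of the models retuned here, the CL-count and memory forms decode to more micro-ops and run at noticeably higher latency than the register/immediate form, which a single shared class could not express.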
@@ -562,9 +562,7 @@ def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> {
def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r,
                               PUSH16rmm, PUSH32rmm, PUSH64rmm,
                               LODSB, LODSL, LODSQ, LODSW,
-                              SCASB, SCASL, SCASQ, SCASW,
-                              SHLD32rrCL, SHRD32rrCL,
-                              SHLD32rri8, SHRD32rri8)>;
+                              SCASB, SCASL, SCASQ, SCASW)>;
def : InstRW<[AtomWrite01_2], (instregex "BT(C|R|S)(16|32|64)mi8",
                               "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)",
                               "XADD(8|16|32|64)rr",
@@ -598,8 +596,6 @@ def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> {
}
def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO,
                               JCXZ, JECXZ, JRCXZ,
-                              SHLD32mrCL, SHRD32mrCL,
-                              SHLD32mri8, SHRD32mri8,
                               LD_F80m)>;
def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm",
                               "(MMX_)?PEXTRWrr(_REV)?")>;
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index d78c343ebd5c..719e71cd25e5 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -168,8 +168,8 @@ defm : JWriteResIntPair<WriteIMul, [JALU1, JMul], 3, [1, 1], 2>; // i8/i16/i32
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; // i64 multiplication
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
-defm : JWriteResIntPair<WriteBSWAP32,[JALU01], 1>;
-defm : JWriteResIntPair<WriteBSWAP64,[JALU01], 1>;
+defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
@@ -188,6 +188,7 @@ defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional m
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;
+def : WriteRes<WriteBitTest,[JALU01]>;
// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;
@@ -209,33 +210,11 @@ defm : X86WriteResPairUnsupported<WriteBZHI>;
defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
-defm : JWriteResIntPair<WriteShiftDouble, [JALU01], 1>;
-
-def JWriteSHLDrri : SchedWriteRes<[JALU01]> {
- let Latency = 3;
- let ResourceCycles = [6];
- let NumMicroOps = 6;
-}
-def: InstRW<[JWriteSHLDrri], (instrs SHLD16rri8, SHLD32rri8, SHLD64rri8,
-                              SHRD16rri8, SHRD32rri8, SHRD64rri8)>;
-
-def JWriteSHLDrrCL : SchedWriteRes<[JALU01]> {
- let Latency = 4;
- let ResourceCycles = [8];
- let NumMicroOps = 7;
-}
-def: InstRW<[JWriteSHLDrrCL], (instrs SHLD16rrCL, SHLD32rrCL, SHLD64rrCL,
-                               SHRD16rrCL, SHRD32rrCL, SHRD64rrCL)>;
-
-def JWriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
- let Latency = 9;
- let ResourceCycles = [1, 22];
- let NumMicroOps = 8;
-}
-def: InstRW<[JWriteSHLDm],(instrs SHLD16mri8, SHLD32mri8, SHLD64mri8,
-                           SHLD16mrCL, SHLD32mrCL, SHLD64mrCL,
-                           SHRD16mri8, SHRD32mri8, SHRD64mri8,
-                           SHRD16mrCL, SHRD32mrCL, SHRD64mrCL)>;
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
+defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>;
+defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
+defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
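For btver2 this folds the three hand-written SchedWriteRes overrides into the shared classes with the numbers carried over verbatim; the mri and mrcl memory forms were already modeled identically, so they share one row. As a rough sketch of what a ResourceCycles entry implies for throughput, assuming JALU01 denotes the pair of integer ALU pipes on Jaguar: dividing cycles consumed on a resource group by the units backing it yields a reciprocal-throughput bound. This mirrors the kind of bound MCSchedModel derives, not the backend's actual code:

```cpp
#include <cassert>

// Hypothetical helper: lower bound on reciprocal throughput contributed by
// one resource entry, i.e. cycles consumed on the group divided by the
// number of execution units backing it.
static double rthroughputBound(unsigned Cycles, unsigned Units) {
  return static_cast<double>(Cycles) / Units;
}

int main() {
  // WriteSHDrri above: ResourceCycles [6] on JALU01 (assumed 2 units)
  // -> at best one register/immediate SHLD every 3 cycles.
  assert(rthroughputBound(6, 2) == 3.0);
  // WriteSHDmri/mrcl: 22 cycles on JALU01 -> at best one every 11 cycles.
  assert(rthroughputBound(22, 2) == 11.0);
  return 0;
}
```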
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
index c938a4a8939e..b1e843013707 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -98,11 +98,16 @@ defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>;
defm : SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteBSWAP32,[SLM_IEC_RSV01], 1>;
-defm : SLMWriteResPair<WriteBSWAP64,[SLM_IEC_RSV01], 1>;
+defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>;
defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>;
-defm : SLMWriteResPair<WriteShiftDouble, [SLM_IEC_RSV0], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [SLM_IEC_RSV0], 1, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[SLM_IEC_RSV0], 1, [1], 1>;
+defm : X86WriteRes<WriteSHDmri, [SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>;
+defm : X86WriteRes<WriteSHDmrcl,[SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>;
+
defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
@@ -115,6 +120,7 @@ def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
let ResourceCycles = [2,1];
}
def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>;
+def : WriteRes<WriteBitTest,[SLM_IEC_RSV01]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
index d28d58580752..7184b850a195 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -180,11 +180,16 @@ defm : ZnWriteResPair<WriteADC, [ZnALU], 1>;
defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>;
defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
-defm : ZnWriteResPair<WriteBSWAP32,[ZnALU], 1, [4]>;
-defm : ZnWriteResPair<WriteBSWAP64,[ZnALU], 1, [4]>;
+defm : X86WriteRes<WriteBSWAP32, [ZnALU], 1, [4], 1>;
+defm : X86WriteRes<WriteBSWAP64, [ZnALU], 1, [4], 1>;
defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
-defm : ZnWriteResPair<WriteShiftDouble, [ZnALU], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [ZnALU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteSHDrrcl>;
+defm : X86WriteResUnsupported<WriteSHDmri>;
+defm : X86WriteResUnsupported<WriteSHDmrcl>;
+
defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
@@ -193,6 +198,7 @@ defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
def : WriteRes<WriteSETCC, [ZnALU]>;
def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
+def : WriteRes<WriteBitTest,[ZnALU]>;
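Each retuned model likewise gains a WriteBitTest entry covering the register forms of BT/BTC/BTR/BTS, replacing the per-instruction regex overrides deleted from the two Skylake files above; per the TODO in X86Schedule.td, the memory forms stay unmodeled for now. For reference, a behavioral sketch of the register-form bit test (hypothetical helper, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// BT r32, r32 copies the selected bit into CF. For register operands the
// bit offset wraps modulo the operand width; memory forms can address bits
// beyond the addressed operand, which is part of what makes folding them
// into the same scheduling class harder.
static bool bt32(uint32_t Value, uint32_t Offset) {
  return (Value >> (Offset & 31)) & 1u;
}

int main() {
  assert(bt32(0x00000002u, 1));
  assert(bt32(0x00000002u, 33)); // 33 mod 32 == 1: same bit as above
  assert(!bt32(0x00000002u, 0));
  return 0;
}
```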
// Bit counts.
defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index fedb13f89e19..85e8256a6e94 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -51,7 +51,7 @@ enum Style {
} // end namespace PICStyles
class X86Subtarget final : public X86GenSubtargetInfo {
-public: 
+public:
enum X86ProcFamilyEnum {
Others,
IntelAtom,
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index bae2ef80c365..865462622627 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2274,8 +2274,8 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
// Sign-extend all constants to a multiple of 64-bit.
APInt ImmVal = Imm;
- if (BitSize & 0x3f)
-   ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+ if (BitSize % 64 != 0)
+   ImmVal = Imm.sext(alignTo(BitSize, 64));
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
@@ -2332,9 +2332,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
// immediates here as the normal path expects bit 31 to be sign extended.
if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
  return TTI::TCC_Free;
-   LLVM_FALLTHROUGH;
+   ImmIdx = 1;
+   break;
case Instruction::Add:
case Instruction::Sub:
+   // For add/sub, we can use the opposite instruction for INT32_MIN.
+   if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
+     return TTI::TCC_Free;
+   ImmIdx = 1;
+   break;
case Instruction::Mul:
case Instruction::UDiv:
case Instruction::SDiv:
@@ -2366,7 +2372,7 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
}
if (Idx == ImmIdx) {
-   int NumConstants = (BitSize + 63) / 64;
+   int NumConstants = divideCeil(BitSize, 64);
int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
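The X86TargetTransformInfo.cpp hunks are mostly readability plus one new special case: alignTo and divideCeil from llvm/Support/MathExtras.h replace the open-coded mask and division, and Add/Sub no longer share the And fall-through, so a 64-bit immediate of 0x80000000 (INT32_MIN as imm32) can be reported as free, since the opposite instruction encodes it as a sign-extended 32-bit immediate. A self-contained check of those identities; the two helpers are simplified stand-ins for the real templated functions:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-ins for llvm::alignTo and llvm::divideCeil.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
static uint64_t divideCeil(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  // alignTo(BitSize, 64) matches the old (BitSize + 63) & ~0x3fU.
  for (uint64_t BitSize : {1u, 8u, 63u, 64u, 65u, 128u})
    assert(alignTo(BitSize, 64) == ((BitSize + 63) & ~0x3fULL));
  // divideCeil(BitSize, 64) matches the old (BitSize + 63) / 64.
  assert(divideCeil(64, 64) == 1 && divideCeil(65, 64) == 2);
  // ADD r64 cannot encode 0x80000000 as a sign-extended imm32, but SUB of
  // the sign-extended value computes the same result modulo 2^64.
  uint64_t X = 0x123456789ABCDEF0ull;
  assert(X + 0x80000000ull == X - 0xFFFFFFFF80000000ull);
  return 0;
}
```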
