author     Dimitry Andric <dim@FreeBSD.org>    2015-08-07 23:01:33 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2015-08-07 23:01:33 +0000
commit     ee8648bdac07986a0f1ec897b02ec82a2f144d46 (patch)
tree       52d1861acda1205241ee35a94aa63129c604d469 /lib/Target
parent     1a82d4c088707c791c792f6822f611b47a12bdfe (diff)
Diffstat (limited to 'lib/Target')
251 files changed, 12046 insertions, 3562 deletions
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index bffd9e6e8c76..79a84ad8c6c5 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -510,9 +510,17 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, if (J.isRegMask()) AvailableRegs.clearBitsNotInMask(J.getRegMask()); - if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) { - assert(J.isDead() && "Non-dead def should have been removed by now!"); - AvailableRegs.reset(J.getReg()); + if (J.isReg() && J.isDef()) { + MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true); + if (J.isDead()) + for (; AI.isValid(); ++AI) + AvailableRegs.reset(*AI); +#ifndef NDEBUG + else + for (; AI.isValid(); ++AI) + assert(!AvailableRegs[*AI] && + "Non-dead def should have been removed by now!"); +#endif } } } @@ -585,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 4691e949838d..815ebef177d8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -40,6 +40,11 @@ def CC_AArch64_AAPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + // The 'nest' parameter, if any, is passed in X18. + // Darwin uses X18 as the platform register and hence 'nest' isn't currently + // supported there. + CCIfNest<CCAssignToReg<[X18]>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index c19fcdc4bb18..072819836bb3 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -310,7 +310,7 @@ CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { } unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 && "Alloca should always return a pointer."); // Don't handle dynamic allocas. @@ -420,7 +420,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); - EVT DestEVT = TLI.getValueType(GV->getType(), true); + EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) return 0; @@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { } unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -538,13 +538,14 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } case Instruction::IntToPtr: { // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; } case Instruction::PtrToInt: { // Look past no-op ptrtoints. 
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; } @@ -879,13 +880,13 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && - TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. - if (InMBB && - TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; } @@ -906,7 +907,7 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) @@ -1390,7 +1391,7 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) { Type *Ty = LHS->getType(); - EVT EVT = TLI.getValueType(Ty, true); + EVT EVT = TLI.getValueType(DL, Ty, true); if (!EVT.isSimple()) return false; MVT VT = EVT.getSimpleVT(); @@ -2761,7 +2762,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { if (SrcReg == 0) return false; - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); if (SrcVT == MVT::f128) return false; @@ -2797,7 +2798,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { return false; bool SrcIsKill = hasTrivialKill(I->getOperand(0)); - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); // Handle sign-extension. if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { @@ -2856,7 +2857,7 @@ bool AArch64FastISel::fastLowerArguments() { if (ArgTy->isStructTy() || ArgTy->isArrayTy()) return false; - EVT ArgVT = TLI.getValueType(ArgTy); + EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; @@ -2898,7 +2899,7 @@ bool AArch64FastISel::fastLowerArguments() { unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { - MVT VT = TLI.getSimpleValueType(Arg.getType()); + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); unsigned SrcReg; const TargetRegisterClass *RC; if (VT >= MVT::i1 && VT <= MVT::i32) { @@ -3689,7 +3690,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; @@ -3724,7 +3725,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; @@ -3772,8 +3773,8 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { Value *Op = I->getOperand(0); Type *SrcTy = Op->getType(); - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); + EVT SrcEVT = TLI.getValueType(DL, SrcTy, true); + EVT DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -4459,7 +4460,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { } bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); + EVT DestEVT = TLI.getValueType(DL, I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -4825,7 +4826,7 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { bool IdxNIsKill = hasTrivialKill(Idx); // If the index is smaller or larger than intptr_t, truncate or extend it. - MVT PtrVT = TLI.getPointerTy(); + MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); @@ -4849,7 +4850,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { // into a single N = N + TotalOffset. uint64_t TotalOffs = 0; Type *Ty = I->getOperand(0)->getType(); - MVT VT = TLI.getPointerTy(); + MVT VT = TLI.getPointerTy(DL); for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { const Value *Idx = *OI; if (auto *StTy = dyn_cast<StructType>(Ty)) { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3ba7e70a102d..a7817f4f67dd 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -349,12 +349,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Allocate space for the rest of the frame. const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = (Alignment > 16); + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; - if (NeedsRealignment) { - // Use the first callee-saved register as a scratch register - assert(MF.getRegInfo().isPhysRegUsed(AArch64::X9) && - "No scratch register to align SP!"); + if (NumBytes && NeedsRealignment) { + // Use the first callee-saved register as a scratch register. 
scratchSPReg = AArch64::X9; } @@ -366,9 +364,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); - assert(!(NeedsRealignment && NumBytes==0) && - "NumBytes should never be 0 when realignment is needed"); - if (NumBytes && NeedsRealignment) { const unsigned NrBitsToZero = countTrailingZeros(Alignment); assert(NrBitsToZero > 1); @@ -881,28 +876,34 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } -void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { +void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); SmallVector<unsigned, 4> UnspilledCSGPRs; SmallVector<unsigned, 4> UnspilledCSFPRs; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { - MRI->setPhysRegUsed(AArch64::FP); - MRI->setPhysRegUsed(AArch64::LR); + SavedRegs.set(AArch64::FP); + SavedRegs.set(AArch64::LR); } // Spill the BasePtr if it's used. Do this first thing so that the // getCalleeSavedRegs() below will get the right answer. if (RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(RegInfo->getBaseRegister()); + SavedRegs.set(RegInfo->getBaseRegister()); if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(AArch64::X9); + SavedRegs.set(AArch64::X9); // If any callee-saved registers are used, the frame cannot be eliminated. unsigned NumGPRSpilled = 0; @@ -924,8 +925,8 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( AArch64::FPR64RegClass.contains(EvenReg)) && "Register class mismatch!"); - const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); - const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); + const bool OddRegUsed = SavedRegs.test(OddReg); + const bool EvenRegUsed = SavedRegs.test(EvenReg); // Early exit if none of the registers in the register pair is actually // used. @@ -946,7 +947,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( if (OddRegUsed ^ EvenRegUsed) { // Find out which register is the additional spill. Reg = OddRegUsed ? 
EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); + SavedRegs.set(Reg); } DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); @@ -1001,7 +1002,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( UnspilledCSGPRs.pop_back(); DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); + SavedRegs.set(Reg); ExtraCSSpill = true; ++Count; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index b496fccba349..731f031ff855 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -59,8 +59,8 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1ea4abcf05fa..772e894f4f0a 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -610,10 +610,11 @@ static bool isWorthFoldingADDlow(SDValue N) { bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm) { SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); const TargetLowering *TLI = getTargetLowering(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); return true; } @@ -628,10 +629,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); Type *Ty = GV->getType()->getElementType(); if (Alignment == 0 && Ty->isSized()) - Alignment = DL->getABITypeAlignment(Ty); + Alignment = DL.getABITypeAlignment(Ty); if (Alignment >= Size) return true; @@ -645,7 +645,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); return true; @@ -688,7 +688,8 @@ bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64); return true; @@ -1494,7 +1495,7 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, } static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, + unsigned &Immr, unsigned &Imms, bool BiggerPattern) { assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "N must be a SHR/SRA operation to call this function"); @@ -1508,7 
+1509,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, "Type checking must have been done before calling this function"); // Check for AND + SRL doing several bits extract. - if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) return true; // we're looking for a shift of a shift @@ -1548,13 +1549,9 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && "bad amount in shift node!"); - // Note: The width operand is encoded as width-1. - unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; - int sLSB = Srl_imm - Shl_imm; - if (sLSB < 0) - return false; - LSB = sLSB; - MSB = LSB + Width; + int immr = Srl_imm - Shl_imm; + Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; + Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; // SRA requires a signed extraction if (VT == MVT::i32) Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; @@ -1564,7 +1561,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, } static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, - SDValue &Opd0, unsigned &LSB, unsigned &MSB, + SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, bool BiggerPattern = false) { if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) @@ -1576,11 +1573,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, return false; break; case ISD::AND: - return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms, NumberOfIgnoredLowBits, BiggerPattern); case ISD::SRL: case ISD::SRA: - return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); } unsigned NOpc = N->getMachineOpcode(); @@ -1593,8 +1590,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case AArch64::UBFMXri: Opc = NOpc; Opd0 = N->getOperand(0); - LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); return true; } // Unreachable @@ -1602,9 +1599,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, } SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { - unsigned Opc, LSB, MSB; + unsigned Opc, Immr, Imms; SDValue Opd0; - if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) return nullptr; EVT VT = N->getValueType(0); @@ -1613,8 +1610,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { // If the bit extract operation is 64bit but the original type is 32bit, we // need to add one EXTRACT_SUBREG. 
if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { - SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, MVT::i64), - CurDAG->getTargetConstant(MSB, dl, MVT::i64)}; + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64), + CurDAG->getTargetConstant(Imms, dl, MVT::i64)}; SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); @@ -1624,8 +1621,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { return Node; } - SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, VT), - CurDAG->getTargetConstant(MSB, dl, VT)}; + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } @@ -2351,7 +2348,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { int FI = cast<FrameIndexSDNode>(Node)->getIndex(); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); const TargetLowering *TLI = getTargetLowering(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); SDLoc DL(Node); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index f3242cdd971d..3e8f46cf1ecd 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -705,7 +705,8 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -774,7 +775,8 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } } -MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { +MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, + EVT) const { return MVT::i64; } @@ -1710,7 +1712,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); @@ -2089,7 +2092,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), + /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) @@ -2111,7 +2115,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. 
- EVT PtrTy = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; @@ -2119,7 +2123,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // case. It should also work for fundamental types too. unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; @@ -2186,7 +2190,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) @@ -2265,6 +2269,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SmallVector<SDValue, 8> MemOps; @@ -2279,7 +2284,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (GPRSaveSize != 0) { GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); @@ -2288,8 +2293,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 8), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, DL, getPointerTy())); + FIN = + DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); @@ -2307,7 +2312,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (FPRSaveSize != 0) { FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); @@ -2317,8 +2322,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 16), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, DL, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, + DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); @@ -2614,7 +2619,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + EVT ActualVT = getValueType(DAG.getDataLayout(), + CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? 
ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; @@ -2674,10 +2680,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, true), DL); - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, + getPointerTy(DAG.getDataLayout())); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; @@ -2743,13 +2751,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(FI); // Make sure any stack arguments overlapping with where we're storing @@ -2759,7 +2767,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(LocMemOffset); } @@ -2809,25 +2817,24 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const GlobalValue *GV = G->getGlobal(); bool InternalLinkage = GV->hasInternalLinkage(); if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -2977,7 +2984,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalAddressSDNode *GN = 
cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); @@ -3069,7 +3076,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); - MVT PtrVT = getPointerTy(); + MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue TLVPAddr = @@ -3124,7 +3131,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -3159,7 +3166,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, } SDValue TPOff; - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); @@ -3786,7 +3793,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && @@ -3812,7 +3819,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { @@ -3853,7 +3860,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { @@ -3879,8 +3886,8 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); @@ -3892,6 +3899,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // Standard, section B.3. 
MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); @@ -3900,8 +3908,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SmallVector<SDValue, 4> MemOps; // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), false, false, 8)); @@ -3910,12 +3917,12 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, if (GPRSize > 0) { SDValue GRTop, GRTopAddr; - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, DL, getPointerTy())); + GRTopAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, DL, getPointerTy())); + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); + GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, + DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); @@ -3925,28 +3932,28 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, DL, getPointerTy())); + VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(16, DL, PtrVT)); - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, DL, getPointerTy())); + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); + VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, + DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, DL, getPointerTy())); + SDValue GROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, DL, getPointerTy())); + SDValue VROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, @@ -3987,21 +3994,22 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), + false, false, false, 0); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) 
&& "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, DL, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, DL, getPointerTy())); + VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Align - 1, DL, PtrVT)); + VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the @@ -4016,8 +4024,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, DL, getPointerTy())); + SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); @@ -4057,8 +4065,8 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) .Default(0); @@ -4079,7 +4087,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, DL, getPointerTy()); + SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); @@ -4232,7 +4240,7 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
AArch64TargetLowering::ConstraintType -AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { +AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: @@ -4283,8 +4291,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> AArch64TargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4320,10 +4327,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { + int RegNo; + bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); + if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. @@ -6429,6 +6435,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { + auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: @@ -6444,7 +6451,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6470,7 +6477,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6488,7 +6495,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; @@ -6501,7 +6508,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; @@ -6572,7 +6579,8 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return true; const TargetOptions &Options = getTargetMachine().Options; - EVT VT = getValueType(User->getOperand(0)->getType()); + const DataLayout &DL = I->getModule()->getDataLayout(); + EVT VT = getValueType(DL, User->getOperand(0)->getType()); if (isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && @@ -6637,6 +6645,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); + auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()); Type *IdxTy = *GTI; // This extension will end up with a shift because of the scaling factor. @@ -6644,7 +6653,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -6708,10 +6717,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - const DataLayout *DL = getDataLayout(); + const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); // Skip illegal vector types. if (VecSize != 64 && VecSize != 128) @@ -6721,8 +6730,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // load integer vectors first and then convert to pointer vectors. 
Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isPointerTy()) - VecTy = VectorType::get(DL->getIntPtrType(EltTy), - VecTy->getVectorNumElements()); + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; @@ -6796,8 +6805,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); - const DataLayout *DL = getDataLayout(); - unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy); + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); // Skip illegal vector types. if (SubVecSize != 64 && SubVecSize != 128) @@ -6810,7 +6819,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { - Type *IntTy = DL->getIntPtrType(EltTy); + Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = dyn_cast<VectorType>(Op0->getType())->getVectorNumElements(); @@ -6894,8 +6903,8 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // AArch64 has five basic addressing modes: // reg @@ -6916,7 +6925,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; @@ -6946,8 +6955,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; } -int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty, +int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency @@ -6956,7 +6965,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, <extend> #imm] | - if (isLegalAddressingMode(AM, Ty, AS)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 46298c0e7de1..c73ce1e54b3e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -233,7 +233,7 @@ public: APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses of the specified type. 
@@ -278,7 +278,8 @@ public: bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; @@ -323,7 +324,7 @@ public: /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// \brief Return the cost of the scaling factor used in the addressing @@ -331,7 +332,7 @@ public: /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty, + int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster @@ -471,9 +472,9 @@ private: std::vector<SDNode *> *Created) const override; bool combineRepeatedFPDivisors(unsigned NumUsers) const override; - ConstraintType - getConstraintType(const std::string &Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -483,14 +484,12 @@ private: std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index b73e0958df90..fa1a46acba84 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -894,6 +894,8 @@ def REVXr : OneXRegData<0b011, "rev", bswap>; def REV32Xr : OneXRegData<0b010, "rev32", UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; +def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>; + // The bswap commutes with the rotr so we want a pattern for both possible // orders. 
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; @@ -5283,18 +5285,23 @@ def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; @@ -5309,12 +5316,16 @@ def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 1836682e386e..841af55f7a65 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,7 +90,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, BitVector AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); // FIXME: avoid re-calculating this every time. 
BitVector Reserved(getNumRegs()); @@ -119,7 +119,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); switch (Reg) { default: @@ -198,11 +198,9 @@ bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = MF.getTarget() - .getSubtargetImpl(*MF.getFunction()) - ->getFrameLowering() - ->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, @@ -213,8 +211,7 @@ AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - + const AArch64FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; } @@ -280,7 +277,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); MachineFrameInfo *MFI = MF.getFrameInfo(); // Estimate an offset from the frame pointer. 
@@ -376,8 +373,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); const AArch64InstrInfo *TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); - const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>( - MF.getSubtarget().getFrameLowering()); + const AArch64FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned FrameReg; @@ -415,7 +411,7 @@ namespace llvm { unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); switch (RC->getID()) { default: diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index b9c53998752a..f40293021d74 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -16,11 +16,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" -AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - -AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} - SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, @@ -37,8 +32,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { const AArch64TargetLowering &TLI = *STI.getTargetLowering(); - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 11932d2b1c22..97421b45b122 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -20,8 +20,6 @@ namespace llvm { class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit AArch64SelectionDAGInfo(const DataLayout *DL); - ~AArch64SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 554826b1e08a..486efd6ce3a2 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -49,15 +49,15 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(FS)), - TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} + InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), + TLInfo(TM, *this) {} /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. 
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
- bool isDecl = GV->isDeclarationForLinker();
+ bool isDef = GV->isStrongDefinitionForLinker();

// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
@@ -66,8 +66,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,

// The small code mode's direct accesses use ADRP, which cannot necessarily
// produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small &&
- GV->isWeakForLinker() && isDecl) {
+ if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) {
// In PIC mode use the GOT, but in absolute mode use a constant pool load.
if (TM.getRelocationModel() == Reloc::Static)
return AArch64II::MO_CONSTPOOL;
@@ -85,8 +84,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
// defined could end up in unexpected places. Use a GOT.
if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
if (isTargetMachO())
- return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
- : AArch64II::MO_NO_FLAG;
+ return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
else
// No need to go through the GOT for local symbols on ELF.
return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc91c94351cc..e085cca35f1c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -181,8 +181,8 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
@@ -265,7 +265,7 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -289,7 +289,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -364,8 +364,8 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
};

- EVT SelCondTy = TLI->getValueType(CondTy);
- EVT SelValTy = TLI->getValueType(ValTy);
+ EVT SelCondTy = TLI->getValueType(DL, CondTy);
+ EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
int Idx = ConvertCostTableLookup(VectorSelectTbl, ISD,
SelCondTy.getSimpleVT(),
@@ -380,7 +380,7 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isIntegerTy(64)) {
@@ -416,7 +416,7 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);

// ldN/stN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 4dabdadd8eeb..444d3ccc15e1 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -31,7 +31,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
typedef TargetTransformInfo TTI;
friend BaseT;

- const AArch64TargetMachine *TM;
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
@@ -50,30 +49,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
- : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)),
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}

// Provide value semantics. MSVC requires that we spell all of these out.
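Aside: the operator= bodies deleted in the TTI hunks below were pure memberwise assignments. Once the class holds only non-owning pointers (ST, TLI), the compiler-generated assignment operators do exactly the same work; the copy/move constructors stay spelled out only for MSVC's sake, per the comment above. A small illustration with made-up types (not the real TTI classes):

#include <utility>

struct Subtarget {};
struct Lowering {};

// Non-owning pointers only: the implicitly-generated copy/move
// assignments already copy ST and TLI correctly, so hand-written
// operator= bodies like the ones removed here add nothing.
struct TTIImplSketch {
  const Subtarget *ST = nullptr;
  const Lowering *TLI = nullptr;
};

int main() {
  Subtarget S;
  Lowering L;
  TTIImplSketch A{&S, &L};
  TTIImplSketch B;
  B = A;                          // implicit copy assignment
  TTIImplSketch C = std::move(A); // implicit move construction
  return (B.ST == &S && C.TLI == &L) ? 0 : 1;
}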
AArch64TTIImpl(const AArch64TTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST),
- TLI(Arg.TLI) {}
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
AArch64TTIImpl(AArch64TTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)),
- ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {}
- AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- TM = RHS.TM;
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- TM = std::move(RHS.TM);
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}

/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 359c2e734e21..db9fb0e775df 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -228,7 +228,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
}

static MCSymbolizer *
-createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo) {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index b5b1d1f9e19c..16d53569b231 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -199,7 +199,7 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
MCTargetStreamer *
createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
- if (TT.getObjectFormat() == Triple::ELF)
+ if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
return nullptr;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 099d1b01c339..9f7bed0d3b12 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -42,16 +42,13 @@ static MCInstrInfo *createAArch64MCInstrInfo() {
static MCSubtargetInfo *
createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
-
if (CPU.empty())
CPU = "generic";

- InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAArch64MCSubtargetInfoImpl(TT, CPU, FS);
}

-static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
+static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAArch64MCRegisterInfo(X, AArch64::LR);
return X;
@@ -75,11 +72,11 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}

-static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
- Triple TheTriple(TT);
- assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
"Only expect Darwin and ELF targets");

if (CM == CodeModel::Default)
@@ -94,7 +91,7 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
"Only small and large code models are allowed on AArch64");

// AArch64 Darwin is always PIC.
- if (TheTriple.isOSDarwin())
+ if (TT.isOSDarwin())
RM = Reloc::PIC_;
// On ELF platforms the default static relocation model has a smart enough
// linker to cope with referencing external symbols defined in a shared
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 569ad3844b25..ef8ef6268548 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -98,6 +98,16 @@ def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"true",
"Enable SI load/store optimizer pass">;

+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 0b426bc63dd5..ad267d350850 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
namespace {

class AMDGPUAlwaysInline : public ModulePass {
-
static char ID;

public:
@@ -36,10 +35,9 @@ public:
char AMDGPUAlwaysInline::ID = 0;

bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ std::vector<Function *> FuncsToClone;
- std::vector<Function*> FuncsToClone;
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
!F.hasFnAttribute(Attribute::NoInline))
FuncsToClone.push_back(&F);
@@ -49,12 +47,11 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
ValueToValueMapTy VMap;
Function *NewFunc = CloneFunction(F, VMap, false);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->getParent()->getFunctionList().push_back(NewFunc);
+ M.getFunctionList().push_back(NewFunc);
F->replaceAllUsesWith(NewFunc);
}

- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
F.addFnAttr(Attribute::AlwaysInline);
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index df4461eac4db..37b77d778d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -110,8 +110,11 @@ private:
SDValue &Offset, SDValue &GLC) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3Mods0Clamp(SDValue In,
SDValue &Src, SDValue &SrcMods,
SDValue &Omod) const;
@@ -859,7 +862,8 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;

- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+ Subtarget->unsafeDSOffsetFoldingEnabled())
return true;

// On Southern Islands instruction with a negative base value and an offset
@@ -1316,6 +1320,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
return true;
}

+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ bool Res = SelectVOP3Mods(In, Src, SrcMods);
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
@@ -1327,6 +1337,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}

+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods, SDValue &Clamp,
+ SDValue &Omod) const {
+ bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
+
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
+ cast<ConstantSDNode>(Clamp)->isNullValue() &&
+ cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In,
SDValue &Src,
SDValue &SrcMods,
SDValue &Omod) const {
@@ -1351,18 +1371,14 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
do {
IsModified = false;
// Go over all selected nodes and try to fold them a bit more
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
-
- SDNode *Node = I;
-
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
if (!MachineNode)
continue;

SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != Node) {
- ReplaceUses(Node, ResNode);
+ if (ResNode != &Node) {
+ ReplaceUses(&Node, ResNode);
IsModified = true;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d56838ec2019..3a65f3b56146 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

+ setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -444,7 +445,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// Target Information
//===----------------------------------------------------------------------===//

-MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}

@@ -545,9 +546,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
- const DataLayout *DL = getDataLayout();
- unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
- unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+ unsigned SrcSize = Src->getScalarSizeInBits();
+ unsigned DestSize = Dest->getScalarSizeInBits();
return SrcSize == 32 && DestSize == 64;
}

@@ -697,7 +697,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -705,20 +705,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}

if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(CFP->getType()));
}

if (StructType *ST = dyn_cast<StructType>(InitTy)) {
- const StructLayout *SL = TD->getStructLayout(ST);
+ const StructLayout *SL = TD.getStructLayout(ST);

EVT PtrVT = InitPtr.getValueType();
SmallVector<SDValue, 8> Chains;
@@ -745,7 +745,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
else
llvm_unreachable("Unexpected type");

- unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+ unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0; i < NumElements; ++i) {
SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
@@ -762,8 +762,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}

Init->dump();
@@ -785,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {

- const DataLayout *TD = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
@@ -801,7 +801,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
Offset = MFI->LDSSize;
MFI->LocalMemoryObjects[GV] = Offset;
// XXX: Account for alignment?
@@ -811,16 +811,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}

return DAG.getConstant(Offset, SDLoc(Op),
- getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
+ getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
case AMDGPUAS::CONSTANT_ADDRESS: {
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
Type *EltType = GV->getType()->getElementType();
- unsigned Size = TD->getTypeAllocSize(EltType);
- unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+ unsigned Size = DL.getTypeAllocSize(EltType);
+ unsigned Alignment = DL.getPrefTypeAlignment(EltType);

- MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
@@ -1653,7 +1653,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// fb = fabs(fb);
fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

// int cv = fr >= fb;
SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
@@ -1960,7 +1960,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2020,7 +2021,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

@@ -2051,7 +2053,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
@@ -2081,7 +2084,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const

SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);

@@ -2100,8 +2104,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
@@ -2172,7 +2176,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2411,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}

+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ // i64 (shl x, 32) -> (build_pair 0, x)
+
+ // Doing this with moves theoretically helps MI optimizations that understand
+ // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+ // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+ // doubles the code size and I'm unsure about cycle count.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 32)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Extract low 32-bits.
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -2448,17 +2480,24 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc DL(N);

switch(N->getOpcode()) {
- default: break;
- case ISD::MUL:
- return performMulCombine(N, DCI);
- case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24: {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- simplifyI24(N0, DCI);
- simplifyI24(N1, DCI);
- return SDValue();
- }
+ default:
+ break;
+ case ISD::SHL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performShlCombine(N, DCI);
+ }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
+ case AMDGPUISD::MUL_I24:
+ case AMDGPUISD::MUL_U24: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ simplifyI24(N0, DCI);
+ simplifyI24(N1, DCI);
+ return SDValue();
+ }
case ISD::SELECT: {
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
@@ -2644,6 +2683,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getRegister(VirtualRegister, VT);
}

+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+ const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+ uint64_t ArgOffset = MFI->ABIArgOffset;
+ switch (Param) {
+ case GRID_DIM:
+ return ArgOffset;
+ case GRID_OFFSET:
+ return ArgOffset + 4;
+ }
+ llvm_unreachable("unexpected implicit parameter type");
+}
+
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
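Aside: the performShlCombine added above rests on a simple bit-level identity: shifting an i64 left by exactly 32 zeroes the low word and moves the low word of the source into the high word, which is what (build_pair 0, lo32(x)) expresses as two 32-bit moves. A standalone sketch verifying the identity (plain C++, not DAG code; helper name is made up):

#include <cassert>
#include <cstdint>

// (x << 32) on a 64-bit value always has a zero low word, and its high
// word is the low word of x -- the two halves of the build_pair.
static uint64_t shl32AsPair(uint64_t X) {
  const uint32_t Lo = 0;                        // low half of the result
  const uint32_t Hi = static_cast<uint32_t>(X); // truncate x to 32 bits
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  for (uint64_t X : {0ull, 1ull, 0x89abcdefull, 0xffffffffffffffffull})
    assert(shl32AsPair(X) == (X << 32));
}

As the in-diff comment notes, the win is that plain copies are transparent to later machine-level optimizations, whereas a 64-bit shift is opaque.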
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fbb7d3c88437..478b2035fd75 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -65,6 +65,7 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;

protected:
@@ -123,7 +124,7 @@ public:
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

- MVT getVectorIdxTy() const override;
+ MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;

bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -207,6 +208,16 @@ public:
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
+
+ enum ImplicitParameter {
+ GRID_DIM,
+ GRID_OFFSET
+ };
+
+ /// \brief Helper function that returns the byte offset of the given
+ /// type of implicit parameter.
+ unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ const ImplicitParameter Param) const;
};

namespace AMDGPUISD {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0779d1d786b2..bd5abc4f546e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -69,6 +69,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 30f50eb1d2f3..90831bfb4458 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ private:
bool EnablePromoteAlloca;
bool EnableIfCvt;
bool EnableLoadStoreOpt;
+ bool EnableUnsafeDSOffsetFolding;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
@@ -222,6 +223,10 @@ public:
return EnableLoadStoreOpt;
}

+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
+ }
+
unsigned getWavefrontSize() const {
return WavefrontSize;
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a911a8efed..2297b52b423c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -156,8 +156,10 @@ public:
} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(
+ AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ });
}

void AMDGPUPassConfig::addIRPasses() {
@@ -269,6 +271,7 @@ void GCNPassConfig::addPreRegAlloc() {
// also need extra copies to the address operand to be eliminated.
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+ insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
addPass(createSIFixSGPRLiveRangesPass(), false);
@@ -280,10 +283,10 @@ void GCNPassConfig::addPostRegAlloc() {
}

void GCNPassConfig::addPreSched2() {
- addPass(createSIInsertWaits(*TM), false);
}

void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIInsertWaits(*TM), false);
addPass(createSILowerControlFlowPass(*TM), false);
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 791c84e6f28b..dee0a69d1e68 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -37,8 +37,9 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUTargetLowering *getTLI() const { return TLI; }

public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
- : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
+ : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}

// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }

bool hasBranchDivergence() { return true; }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 7172e4bb9335..c709741f3777 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -44,7 +44,7 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
return X;
}

-static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAMDGPUMCRegisterInfo(X, 0);
return X;
@@ -52,14 +52,13 @@ static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo * X = new MCSubtargetInfo();
- InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}

-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
X->initMCCodeGenInfo(RM, CM, OL);
return X;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8357b6d9d0ed..4e4d554f0ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -815,8 +815,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);

- case Intrinsic::AMDGPU_read_workdim:
- return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+ case Intrinsic::AMDGPU_read_workdim: {
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
+ return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
+ }

case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
@@ -897,8 +899,9 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,

for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
- Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
- DAG.getConstant(i, DL, getVectorIdxTy())));
+ Args.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+ DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
}

return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1459,22 +1462,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;

- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- if (Ret.getNode()) {
- SDValue Ops[2] = {
- Ret,
- Chain
- };
- return DAG.getMergeValues(Ops, DL);
- }
+ if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
+ return Ret;

// Lower loads constant address space global variable loads
if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
isa<GlobalVariable>(GetUnderlyingObject(
- LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
+ LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {

- SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
- getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ SDValue Ptr = DAG.getZExtOrTrunc(
+ LoadNode->getBasePtr(), DL,
+ getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, DL, MVT::i32));
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
@@ -1702,7 +1700,8 @@ SDValue R600TargetLowering::LowerFormalArguments(
return Chain;
}

-EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index c06d3c4fd309..4dbac97af2a1 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -38,7 +38,9 @@ public:
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const override;
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index d14e37a64612..c2887255cc11 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,11 +126,42 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}

+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+ const MachineInstr *MI) {
+ for (auto Candidate : FoldList) {
+ if (Candidate.UseMI == MI)
+ return true;
+ }
+ return false;
+}
+
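Aside: isUseMIInFoldList above is a plain linear membership test over the pending fold candidates; before commuting an instruction to legalize a new fold, the pass must verify no earlier fold already targets it, since commuting would silently invalidate that earlier fold. An equivalent formulation with std::any_of, using stand-in types reduced to the one field the check consults:

#include <algorithm>
#include <vector>

struct MachineInstr {};
struct FoldCandidate {
  const MachineInstr *UseMI;
  int OpNo;
};

// Same semantics as the loop in the diff: true if any recorded fold
// already points at MI.
static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  return std::any_of(FoldList.begin(), FoldList.end(),
                     [MI](const FoldCandidate &C) { return C.UseMI == MI; });
}

int main() {
  MachineInstr A, B;
  std::vector<FoldCandidate> Folds = {{&A, 0}};
  return (isUseMIInFoldList(Folds, &A) && !isUseMIInFoldList(Folds, &B)) ? 0
                                                                         : 1;
}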
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+
+ // Special case for v_mac_f32_e64 if we are trying to fold into src2
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_MAC_F32_e64 &&
+ (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ // Check if changing this to a v_mad_f32 instruction will allow us to
+ // fold the operand.
+ MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
+ bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+ if (FoldAsMAD) {
+ MI->untieRegOperand(OpNo);
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ }
+
+ // If we are already folding into another operand of MI, then
+ // we can't commute the instruction, otherwise we risk making the
+ // other fold illegal.
+ if (isUseMIInFoldList(FoldList, MI))
+ return false;
+
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteIdx0;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ead1a3743473..dd818a9ba746 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,8 +254,9 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
return false;
}

-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
@@ -416,7 +417,7 @@ static EVT toIntegerVT(EVT VT) {
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
- const DataLayout *DL = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
@@ -425,16 +426,16 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
Type *Ty = VT.getTypeForEVT(*DAG.getContext());

MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
- SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

- unsigned Align = DL->getABITypeAlignment(Ty);
+ unsigned Align = DL.getABITypeAlignment(Ty);

if (VT != MemVT && VT.isFloatingPoint()) {
// Do an integer load and convert.
@@ -451,7 +452,12 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
true, // isNonTemporal
true, // isInvariant
Align); // Alignment
- return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
+ Load.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
@@ -569,6 +575,8 @@ SDValue SITargetLowering::LowerFormalArguments(

AnalyzeFormalArguments(CCInfo, Splits);

+ SmallVector<SDValue, 16> Chains;
+
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

const ISD::InputArg &Arg = Ins[i];
@@ -587,8 +595,9 @@ SDValue SITargetLowering::LowerFormalArguments(
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));

const PointerType *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
@@ -614,7 +623,8 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
&AMDGPU::SReg_64RegClass);
Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
- InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
continue;
}

@@ -634,7 +644,9 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned j = 1; j != NumElements; ++j) {
Reg = ArgLocs[ArgIdx++].getLocReg();
Reg = MF.addLiveIn(Reg, RC);
- Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
}

// Fill up the missing vector elements
@@ -653,7 +665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
- return Chain;
+
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -695,14 +711,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return true;
}

-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
return MVT::i32;
}

@@ -888,7 +905,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());

SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
@@ -926,6 +943,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
@@ -964,8 +982,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::AMDGPU_read_workdim:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
- false);
+ getImplicitParameterOffset(MFI, GRID_DIM), false);

case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
@@ -1213,7 +1230,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {

const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

@@ -1411,7 +1429,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
unsigned AS = Load->getAddressSpace();
unsigned Align = Load->getAlignment();
Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);

// Don't try to replace the load if we have to expand it due to alignment
// problems. Otherwise we will end up scalarizing the load, and trying to
@@ -2212,9 +2230,8 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint_,
+ StringRef Constraint,
MVT VT) const {
- StringRef Constraint(Constraint_);
if (Constraint == "r") {
switch(VT.SimpleTy) {
default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a956b013bdb1..635b4edc89de 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -62,8 +62,8 @@ public:
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;

- bool isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;

bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
@@ -90,8 +90,9 @@ public:
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
- MVT getScalarShiftAmountTy(EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -114,9 +115,9 @@ public:
SDLoc DL,
SDValue Ptr) const;

- std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI,
- const std::string &Constraint, MVT VT) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
};
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb96bd0227b2..18910615bebe 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -227,9 +227,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
uint8_t Offset0 = Offset0Imm->getImm();
uint8_t Offset1 = Offset1Imm->getImm();
- assert(Offset1 > Offset0);

- if (Offset1 - Offset0 == 1) {
+ if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.

@@ -924,7 +923,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
return false;

unsigned Opc = UseMI->getOpcode();
- if (Opc == AMDGPU::V_MAD_F32) {
+ if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -963,9 +962,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.

// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));

unsigned Src1Reg = Src1->getReg();
@@ -980,6 +979,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
Src1->setSubReg(Src2SubReg);
Src1->setIsKill(Src2->isKill());

+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src2));
+
+ // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);

removeModOperands(*UseMI);
@@ -1010,11 +1017,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.

// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));

+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);

// These come before src2.
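Aside: the V_MAC_F32/V_MAD_F32 interplay in this file and the convertToThreeAddress hunk that follows comes down to the algebra of multiply-add: mac is mad with the addend tied to the destination register. A minimal scalar sketch of the equivalence (illustrative helpers only; this ignores the GPU's actual rounding/fusion behavior):

#include <cassert>

// v_mad_f32 dst, a, b, c computes dst = a * b + c with a free addend;
// v_mac_f32 dst, a, b computes dst = a * b + dst (dst doubles as src2).
static float mad(float A, float B, float C) { return A * B + C; }
static float mac(float A, float B, float &Dst) { return Dst = mad(A, B, Dst); }

int main() {
  float Acc = 4.0f;
  float ViaMad = mad(2.0f, 3.0f, Acc); // untied three-address form
  mac(2.0f, 3.0f, Acc);                // tied two-address form updates Acc
  assert(Acc == ViaMad && Acc == 10.0f);
}

This is why untying src2 (or re-tying it) is the only bookkeeping needed when the passes above flip between the two opcodes.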
@@ -1126,6 +1139,38 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
return false;
}

+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const {
+
+ switch (MI->getOpcode()) {
+ default: return nullptr;
+ case AMDGPU::V_MAC_F32_e64: break;
+ case AMDGPU::V_MAC_F32_e32: {
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ return nullptr;
+ break;
+ }
+ }
+
+ const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+
+ return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ .addOperand(*Dst)
+ .addImm(0) // Src0 mods
+ .addOperand(*Src0)
+ .addImm(0) // Src1 mods
+ .addOperand(*Src1)
+ .addImm(0) // Src mods
+ .addOperand(*Src2)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
@@ -1625,7 +1670,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,

if (MO->isReg()) {
assert(DefinedRC);
- const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
+ MRI.getRegClass(MO->getReg()) :
+ RI.getPhysRegClass(MO->getReg());

// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0382272068d2..015ea12d4598 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,10 @@ public:
unsigned getMachineCSELookAheadLimit() const override { return 500; }

+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const override;
+
bool isSALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fcb58d5da3b0..b39a78714640 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -529,9 +529,11 @@ def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;

def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;

//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -1113,6 +1115,13 @@ def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
field string Asm = "$dst, $src0, $vsrc1, $src2";
}
+def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ HasModifiers>.ret;
+ let Asm32 = getAsm32<2>.ret;
+ let Asm64 = getAsm64<2, HasModifiers>.ret;
+}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 8c8d836776db..1ee63c675822 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1488,7 +1488,10 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;

-defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
+let Constraints = "$dst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
+}
} // End isCommutable = 1

defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
@@ -2206,6 +2209,15 @@ def : Pat <
(V_CNDMASK_B32_e64 $src2, $src1, $src0)
>;

+// Pattern for V_MAC_F32
+def : Pat <
+ (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3NoMods f32:$src1, i32:$src1_modifiers),
+ (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
+ (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ $src2_modifiers, $src2, $clamp, $omod)
+>;
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9b1d256dc5a8..1bdb1f0ee9f9 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -214,12 +214,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
// cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

- unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
- unsigned DestReg1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+ const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+ const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
unsigned Offset1
= TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

@@ -258,20 +257,43 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

- updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
- updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);

- LIS->RemoveMachineInstrFromMaps(I);
- // Replacing Paired in the maps with Read2 allows us to avoid updating the
- // live range for the m0 register.
- LIS->ReplaceMachineInstrInMaps(Paired, Read2);
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+ // Copy to the old destination registers.
+ MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ LIS->InsertMachineInstrInMaps(Read2);
+
+ // repairLiveintervalsInRange() doesn't handle physical register, so we have
+ // to update the M0 range manually.
+ SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+ LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+ bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
+
+ // The new write to the original destination register is now the copy. Steal
+ // the old SlotIndex.
+ LIS->ReplaceMachineInstrInMaps(I, Copy0);
+ LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
+
I->eraseFromParent();
Paired->eraseFromParent();

LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);

- LIS->getInterval(DestReg); // Create new LI
+ LIS->createAndComputeVirtRegInterval(DestReg);
+
+ if (UpdateM0Range) {
+ SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+ M0Segment->end = Read2Index.getRegSlot();
+ }

DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Read2.getInstr();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63d6796..d23b92edef33 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,7 +53,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
- MRI.setPhysRegUsed(LaneVGPR);

// Add this register as live-in to all blocks to avoid machine verifer
// complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index 0a7f684552f0..b086d2ed6652 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,7 +91,6 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {

if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
- MRI.setPhysRegUsed(ScratchOffsetReg);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
.addReg(ScratchOffsetPreloadReg);
} else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index db2ff0b1f952..ce4acafac9fa 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -499,7 +499,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,

for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
+ if (MRI.reg_nodbg_empty(*I))
return *I;
}
return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 51e72cdb5f9e..5d00bdd6a9bb 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -94,8 +94,20 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// is vcc. We should handle this the same way we handle vopc, by addding
// a register allocation hint pre-regalloc and then do the shrining
// post-regalloc.
- if (Src2) - return false; + if (Src2) { + switch (MI.getOpcode()) { + default: return false; + + case AMDGPU::V_MAC_F32_e64: + if (!isVGPR(Src2, TRI, MRI) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) + return false; + break; + + case AMDGPU::V_CNDMASK_B32_e64: + break; + } + } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src1Mod = @@ -149,7 +161,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return; // Try to fold Src0 - if (Src0.isReg()) { + if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { unsigned Reg = Src0.getReg(); MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { @@ -243,6 +255,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { + // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC + // instructions. + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (!Src2->isReg()) + continue; + unsigned SReg = Src2->getReg(); + if (TargetRegisterInfo::isVirtualRegister(SReg)) { + MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); + continue; + } + if (SReg != AMDGPU::VCC) + continue; + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); @@ -259,6 +287,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) + Inst32.addOperand(*Src2); + ++NumInstructionsShrunk; MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 96b4742da2bb..ef609a66d032 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -150,6 +150,10 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + // ARM ISAs. def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index b1a11d626bda..9f43e732bd73 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1230,8 +1230,7 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { Reloc::Model RM = MF.getTarget().getRelocationModel(); if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { - assert(getSubtarget().getTargetTriple().getObjectFormat() == - Triple::MachO && + assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() && "LOAD_STACK_GUARD currently supported only for MachO."); expandLoadStackGuard(MI, RM); MI->getParent()->erase(MI); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3f79a9b53d70..e7d5be7753e4 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -127,7 +127,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // FIXME: avoid re-calculating this every time. 
BitVector Reserved(getNumRegs()); @@ -194,7 +194,7 @@ unsigned ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); switch (RC->getID()) { default: @@ -302,7 +302,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // When outgoing call frames are so large that we adjust the stack pointer // around the call, we can no longer use the stack pointer to reach the @@ -333,6 +333,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineRegisterInfo *MRI = &MF.getRegInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // We can't realign the stack if: // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or @@ -347,7 +348,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { return false; // We may also need a base pointer if there are dynamic allocas or stack // pointer adjustments around calls. - if (MF.getSubtarget().getFrameLowering()->hasReservedCallFrame(MF)) + if (TFI->hasReservedCallFrame(MF)) return true; // A base pointer is required and allowed. Check that it isn't too late to // reserve it. @@ -357,9 +358,9 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { bool ARMBaseRegisterInfo:: needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = - MF.getSubtarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -378,7 +379,7 @@ cannotEliminateFrame(const MachineFunction &MF) const { unsigned ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) return getFramePointerReg(STI); @@ -517,7 +518,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. 
MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -694,8 +695,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const ARMFrameLowering *TFI = static_cast<const ARMFrameLowering *>( - MF.getSubtarget().getFrameLowering()); + const ARMFrameLowering *TFI = getFrameLowering(MF); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 7dd21ecbe91b..27cf06b995a0 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -142,6 +142,9 @@ def CC_ARM_AAPCS : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<4, 4>>, + // The 'nest' parameter, if any, is passed in R12. + CCIfNest<CCAssignToReg<[R12]>>, + // Handle all vector types as either f64 or v2f64. CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 4175b4af86e6..fdd0763ea608 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -49,8 +49,6 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -extern cl::opt<bool> EnableARMLongCalls; - namespace { // All possible address modes, plus some. @@ -685,7 +683,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { } unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) return 0; @@ -732,7 +730,7 @@ unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) { } bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) return false; @@ -786,12 +784,13 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { return ARMComputeAddress(U->getOperand(0), Addr); case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return ARMComputeAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. 
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return ARMComputeAddress(U->getOperand(0), Addr); break; case Instruction::GetElementPtr: { @@ -1365,7 +1364,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt) { Type *Ty = Src1Value->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); + EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -1557,7 +1556,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { return false; Value *Src = I->getOperand(0); - EVT SrcEVT = TLI.getValueType(Src->getType(), true); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -1750,7 +1749,7 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) { } bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); // We can get here in the case when we have a binary operation on a non-legal // type and the target independent selector doesn't know how to handle it. @@ -1790,7 +1789,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { } bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { - EVT FPVT = TLI.getValueType(I->getType(), true); + EVT FPVT = TLI.getValueType(DL, I->getType(), true); if (!FPVT.isSimple()) return false; MVT VT = FPVT.getSimpleVT(); @@ -2095,7 +2094,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -2122,7 +2121,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; MVT RVVT = RVEVT.getSimpleVT(); MVT DestVT = VA.getValVT(); @@ -2173,7 +2172,7 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { unsigned ARMFastISel::getLibcallReg(const Twine &Name) { // Manually compute the global's type to avoid building it when unnecessary. Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0); - EVT LCREVT = TLI.getValueType(GVTy); + EVT LCREVT = TLI.getValueType(DL, GVTy); if (!LCREVT.isSimple()) return 0; GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false, @@ -2246,19 +2245,19 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { return false; unsigned CalleeReg = 0; - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); if (CalleeReg == 0) return false; } // Issue the call. - unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls); + unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls()); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); // BL / BLX don't take a predicate, but tBL / tBLX do. 
if (isThumb2)
AddDefaultPred(MIB);
- if (EnableARMLongCalls)
+ if (Subtarget->genLongCalls())
MIB.addReg(CalleeReg);
else
MIB.addExternalSymbol(TLI.getLibcallName(Call));
@@ -2380,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
bool UseReg = false;
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV || EnableARMLongCalls) UseReg = true;
+ if (!GV || Subtarget->genLongCalls()) UseReg = true;
unsigned CalleeReg = 0;
if (UseReg) {
@@ -2576,8 +2575,8 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) {
Value *Op = I->getOperand(0);
EVT SrcVT, DestVT;
- SrcVT = TLI.getValueType(Op->getType(), true);
- DestVT = TLI.getValueType(I->getType(), true);
+ SrcVT = TLI.getValueType(DL, Op->getType(), true);
+ DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
return false;
@@ -2742,8 +2741,8 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) {
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
- SrcEVT = TLI.getValueType(SrcTy, true);
- DestEVT = TLI.getValueType(DestTy, true);
+ SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple()) return false;
if (!DestEVT.isSimple()) return false;
@@ -2763,7 +2762,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
return false;
// Only handle i32 now.
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (DestVT != MVT::i32)
return false;
@@ -3026,7 +3025,7 @@ bool ARMFastISel::fastLowerArguments() {
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
- EVT ArgVT = TLI.getValueType(ArgTy);
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
case MVT::i8:
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a52e49780e27..6744000afe2b 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -800,7 +800,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
// This is bad: if an interrupt is taken after the mov, sp is in an
// inconsistent state.
// Use the first callee-saved register as a scratch register.
- assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+ assert(!MFI->getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
ARMCC::AL, 0, TII);
@@ -1470,7 +1470,8 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
// callee-saved vector registers after realigning the stack. The vst1 and vld1
// instructions take alignment hints that can improve performance.
//
-static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
if (!SpillAlignedNEONRegs)
return;
@@ -1497,10 +1498,9 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
// callee-saved registers in order, but it can happen that there are holes in
// the range. Registers above the hole will be spilled to the standard DPRCS
// area.
- MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumSpills = 0;
for (; NumSpills < 8; ++NumSpills)
- if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
+ if (!SavedRegs.test(ARM::D8 + NumSpills))
break;
// Don't do this for just one d-register. It's not worth it.
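The hunk that follows is the heart of the ARM side of this patch: processFunctionBeforeCalleeSavedScan() is replaced by the determineCalleeSaves() hook, and every MRI.setPhysRegUsed(Reg) becomes SavedRegs.set(Reg) on the BitVector that PEI now passes in. A minimal sketch of the new override shape, using a hypothetical MyTarget purely for illustration:

void MyTargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 RegScavenger *RS) const {
  // The base implementation marks the callee-saved registers that are
  // actually clobbered in MF; the target then forces any extra saves.
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  if (needsEmergencyScratchReg(MF)) // hypothetical target-specific check
    SavedRegs.set(MyTarget::R4);    // e.g. keep a scratch register spilled
}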
@@ -1511,12 +1511,13 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
// A scratch register is required for the vst1 / vld1 instructions.
- MF.getRegInfo().setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
-void
-ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
// This tells PEI to spill the FP as if it is any other callee-save register
// to take advantage of the eliminateFrameIndex machinery. This also ensures it
// is spilled in the order specified by getCalleeSavedRegs() to make it easier
@@ -1543,12 +1544,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
if (AFI->isThumb2Function() &&
(MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if the Thumb1 function uses variable-length argument lists.
if (AFI->getArgRegsSaveSize() > 0)
- MRI.setPhysRegUsed(ARM::LR);
+ SavedRegs.set(ARM::LR);
// Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
// for sure what the stack size will be, but for this, an estimate is good
@@ -1558,23 +1559,23 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
unsigned StackSize = MFI->estimateStackSize(MF);
if (MFI->hasVarSizedObjects() || StackSize > 508)
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
// See if we can spill vector registers to aligned stack.
- checkNumAlignedDPRCS2Regs(MF);
+ checkNumAlignedDPRCS2Regs(MF, SavedRegs);
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
- MRI.setPhysRegUsed(RegInfo->getBaseRegister());
+ SavedRegs.set(RegInfo->getBaseRegister());
// Don't spill FP if the frame can be eliminated. This is determined
- // by scanning the callee-save registers to see if any is used.
+ // by scanning the callee-save registers to see if any is modified.
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
bool Spilled = false;
- if (MRI.isPhysRegUsed(Reg)) {
+ if (SavedRegs.test(Reg)) {
Spilled = true;
CanEliminateFrame = false;
}
@@ -1668,7 +1669,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled,
// spill LR as well so we can fold BX_RET into the registers restore (LDM).
if (!LRSpilled && CS1Spilled) { - MRI.setPhysRegUsed(ARM::LR); + SavedRegs.set(ARM::LR); NumGPRSpills++; SmallVectorImpl<unsigned>::iterator LRPos; LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), @@ -1681,7 +1682,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (hasFP(MF)) { - MRI.setPhysRegUsed(FramePtr); + SavedRegs.set(FramePtr); auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), FramePtr); if (FPPos != UnspilledCS1GPRs.end()) @@ -1700,7 +1701,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill high register if the function is thumb if (!AFI->isThumbFunction() || isARMLowRegister(Reg) || Reg == ARM::LR) { - MRI.setPhysRegUsed(Reg); + SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) ExtraCSSpill = true; break; @@ -1708,7 +1709,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) { unsigned Reg = UnspilledCS2GPRs.front(); - MRI.setPhysRegUsed(Reg); + SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) ExtraCSSpill = true; } @@ -1747,7 +1748,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (Extras.size() && NumExtras == 0) { for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MRI.setPhysRegUsed(Extras[i]); + SavedRegs.set(Extras[i]); } } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot @@ -1761,7 +1762,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (ForceLRSpill) { - MRI.setPhysRegUsed(ARM::LR); + SavedRegs.set(ARM::LR); AFI->setLRIsSpilledForFarJump(true); } } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index d763d17a506f..6fdc5eff5e47 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -54,8 +54,8 @@ public: unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; void adjustForSegmentedStacks(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 50afb192b331..b110628a0a86 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -533,7 +533,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. 
int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -556,7 +557,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -702,7 +704,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { Base = N.getOperand(0); @@ -722,7 +725,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -900,7 +904,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N), @@ -915,7 +920,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -964,7 +970,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { Base = N.getOperand(0); @@ -981,7 +988,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } ARM_AM::AddrOpc AddSub = ARM_AM::add; @@ -1215,7 +1223,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, MachineFrameInfo *MFI = MF->getFrameInfo(); if (MFI->getObjectAlignment(FI) < 4) MFI->setObjectAlignment(FI, 4); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); 
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -1237,7 +1246,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, MachineFrameInfo *MFI = MF->getFrameInfo(); if (MFI->getObjectAlignment(FI) < 4) MFI->setObjectAlignment(FI, 4); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1285,7 +1295,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -1314,7 +1325,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1343,7 +1355,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1438,7 +1451,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32); @@ -2510,7 +2524,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (UseCP) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), - TLI->getPointerTy()); + TLI->getPointerTy(CurDAG->getDataLayout())); SDNode *ResNode; if (Subtarget->isThumb()) { @@ -2540,7 +2554,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ISD::FrameIndex: { // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. 
int FI = cast<FrameIndexSDNode>(N)->getIndex(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); if (Subtarget->isThumb1Only()) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 4b2105b7442f..e335784f6d87 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -60,11 +60,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); -cl::opt<bool> -EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions"), - cl::init(false)); - static cl::opt<bool> ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), @@ -548,6 +543,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // NEON does not have single instruction CTTZ for vectors. + setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + // NEON only has FMA instructions as of VFP4. 
if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); @@ -1149,8 +1165,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return getPointerTy(); +EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } @@ -1429,7 +1447,8 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1453,7 +1472,8 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, + getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, @@ -1526,7 +1546,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -1607,7 +1628,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); @@ -1628,12 +1650,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (Flags.getByValSize() > 4*offset) { + auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, @@ -1693,8 +1715,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = false; bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + auto PtrVt = getPointerTy(DAG.getDataLayout()); - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { assert((Subtarget->isTargetWindows() || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); @@ -1709,12 +1732,11 @@ 
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); @@ -1724,29 +1746,28 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); isDirect = true; - bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); - bool isStub = (isExt && Subtarget->isTargetMachO()) && + bool isDef = GV->isStrongDefinitionForLinker(); + bool isStub = (!isDef && Subtarget->isTargetMachO()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); + isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); - Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(), - DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), - 0, ARMII::MO_NONLAZY)); - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && @@ -1754,20 +1775,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned TargetFlags = GV->hasDLLImportStorageClass() ? 
ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0, - TargetFlags); + Callee = + DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(), - Callee), MachinePointerInfo::getGOT(), - false, false, false, 0); + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(), false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { isDirect = true; @@ -1781,22 +1802,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - Callee = DAG.getNode(ARMISD::PIC_ADD, dl, - getPointerTy(), Callee, PICLabel); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { unsigned OpFlags = 0; // On ELF targets for PIC code, direct calls should go through the PLT if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); } } @@ -2433,7 +2452,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; @@ -2462,7 +2481,7 @@ SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2508,7 +2527,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -2574,7 +2593,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { @@ -2617,7 +2636,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2648,7 +2667,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); @@ -2672,7 +2691,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = @@ -2716,14 +2735,14 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; unsigned PCAdj = (RelocM != Reloc::PIC_) @@ -2820,7 +2839,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2850,7 +2869,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. 
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -2904,8 +2923,9 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, if (REnd != RBegin) ArgOffset = -4 * (ARM::R4 - RBegin); + auto PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); - SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); SmallVector<SDValue, 4> MemOps; const TargetRegisterClass *RC = @@ -2918,8 +2938,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, dl, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } if (!MemOps.empty()) @@ -3013,6 +3032,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); + auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3035,7 +3055,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -3122,7 +3142,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, CurByValIndex, VA.getLocMemOffset(), Flags.getByValSize()); - InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); + InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); @@ -3130,7 +3150,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, FIOffset, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); @@ -3855,7 +3875,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Index = Op.getOperand(2); SDLoc dl(Op); - EVT PTy = getPointerTy(); + EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); @@ -4102,8 +4122,8 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
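The vector CTTZ lowering introduced in LowerCTTZ() below rests on the identity its comments cite: x & -x isolates the lowest set bit, and popcounting (x & -x) - 1 counts the zeros beneath it. A scalar model of the trick (my own illustration, not code from this patch):

#include <cassert>
#include <cstdint>

// cttz(x) == popcount((x & -x) - 1) for x != 0: x & -x keeps only the
// lowest set bit of x, and subtracting 1 turns that bit into a mask of
// exactly cttz(x) one-bits.
static unsigned cttzViaPopcount(uint32_t X) {
  uint32_t LSB = X & (0u - X); // the same X & -X the DAG nodes compute
  return __builtin_popcount(LSB - 1);
}

int main() {
  for (uint32_t X = 1; X < (1u << 20); ++X)
    assert(cttzViaPopcount(X) == unsigned(__builtin_ctz(X)));
  return 0;
}

The i8 and i64 element cases take exactly this ctpop route (vcnt.8 plus vpaddl chains), while i16/i32 under CTTZ_ZERO_UNDEF can instead use (width - 1) - ctlz(lsb), since NEON has a native vector CLZ.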
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", ARM::SP) .Default(0); @@ -4163,7 +4183,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; - if (TLI.isBigEndian() && SrcVT.isVector() && + if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), @@ -4283,8 +4303,82 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. + SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } if (!ST->hasV6T2Ops()) return SDValue(); @@ -4730,7 +4824,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getTargetLoweringInfo().isBigEndian()) + if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); @@ -5868,7 +5962,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; - unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); @@ -6013,7 +6107,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); - unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } @@ -6342,18 +6436,19 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); // Create stack object for sret. - const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); - const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); + auto &DL = DAG.getDataLayout(); + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; ArgListEntry Entry; @@ -6373,7 +6468,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) @@ -6387,7 +6482,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(), false, false, false, 0); // Address of cos field. 
- SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo(), false, false, false, 0); @@ -6487,7 +6582,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); @@ -6845,9 +6941,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6935,9 +7031,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -7313,9 +7409,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb1) @@ -8001,7 +8097,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, // Build operand list. SmallVector<SDValue, 8> Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, - TLI.getPointerTy())); + TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); @@ -8681,7 +8777,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); - if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; @@ -9312,7 +9408,9 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = TLI.isBigEndian() ? 
(i+1) * SizeRatio - 1 : i * SizeRatio; + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() + ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); @@ -9339,8 +9437,8 @@ static SDValue PerformSTORECombine(SDNode *N, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, DL, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); SDValue BasePtr = St->getBasePtr(); // Perform one or more big stores into memory. @@ -9367,7 +9465,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; - bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, @@ -10078,7 +10176,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses - if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; @@ -10317,10 +10415,10 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { - EVT VT = getValueType(Ty, true); + EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; @@ -10664,7 +10762,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
 ARMTargetLowering::ConstraintType
-ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+ARMTargetLowering::getConstraintType(StringRef Constraint) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default:  break;
@@ -10723,10 +10821,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
 }

 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
-RCPair
-ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                                const std::string &Constraint,
-                                                MVT VT) const {
+RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC ARM Constraint Letters
     switch (Constraint[0]) {
@@ -10974,7 +11070,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   }

   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
-                                         getPointerTy());
+                                         getPointerTy(DAG.getDataLayout()));

   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
@@ -11083,7 +11179,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vld4lane: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
@@ -11103,12 +11200,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vst4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     unsigned NumElts = 0;
     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+      NumElts += DL.getTypeAllocSize(ArgTy) / 8;
     }
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
@@ -11122,12 +11220,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
   case Intrinsic::arm_ldaex:
   case Intrinsic::arm_ldrex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.vol = true;
     Info.readMem = true;
     Info.writeMem = false;
@@ -11135,12 +11234,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
   case Intrinsic::arm_stlex:
   case Intrinsic::arm_strex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.vol = true;
     Info.readMem = false;
     Info.writeMem = true;
@@ -11427,9 +11527,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   VectorType *VecTy = Shuffles[0]->getType();
   Type *EltTy = VecTy->getVectorElementType();

-  const DataLayout *DL = getDataLayout();
-  unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
-  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;

   // Skip illegal vector types and vector types of i64/f64 element (vldN
   // doesn't support i64/f64 element).
@@ -11439,8 +11539,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
-    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
-                            VecTy->getVectorNumElements());
+    VecTy =
+        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                             Intrinsic::arm_neon_vld3,
@@ -11517,9 +11617,9 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   Type *EltTy = VecTy->getVectorElementType();
   VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

-  const DataLayout *DL = getDataLayout();
-  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
-  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;

   // Skip illegal sub vector types and vector types of i64/f64 element (vstN
   // doesn't support i64/f64 element).
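The hunks above all make the same substitution: ARMTargetLowering no longer reads a cached pointer through TargetLowering::getDataLayout(), it fetches the DataLayout from whatever IR or codegen object is at hand. A minimal sketch of the three lookup routes this patch uses, assuming an LLVM tree of roughly this vintage (the dlFrom* helper names are illustrative, not part of the patch):

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // All three routes end at the same module-level DataLayout; pick the one
    // reachable from the context you are in.
    static const DataLayout &dlFromIR(const LoadInst &LI) {
      return LI.getModule()->getDataLayout();  // IR level, e.g. lowerInterleavedLoad
    }
    static const DataLayout &dlFromMF(const MachineFunction &MF) {
      return MF.getDataLayout();               // MachineFunction passes
    }
    static const DataLayout &dlFromDAG(const SelectionDAG &DAG) {
      return DAG.getDataLayout();              // DAG combines and lowering
    }

Calls such as TLI.getPointerTy(DAG.getDataLayout()) and getValueType(DL, Ty, true) in the hunks above are the same change at the API surface: the DataLayout becomes an explicit argument instead of state owned by the target.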
@@ -11533,7 +11633,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
   // vectors to integer vectors.
   if (EltTy->isPointerTy()) {
-    Type *IntTy = DL->getIntPtrType(EltTy);
+    Type *IntTy = DL.getIntPtrType(EltTy);

     // Convert to the corresponding integer vector.
     Type *IntVecTy =
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 74396392f8e3..efc9020c193a 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -249,7 +249,8 @@ namespace llvm {
     }

     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
-    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;

     MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -286,8 +287,8 @@ namespace llvm {

     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
-                               unsigned AS) const override;
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
     bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;

     /// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -324,8 +325,7 @@ namespace llvm {

     bool ExpandInlineAsm(CallInst *CI) const override;

-    ConstraintType
-      getConstraintType(const std::string &Constraint) const override;
+    ConstraintType getConstraintType(StringRef Constraint) const override;

     /// Examine constraint string and operand type and determine a weight value.
     /// The operand object must already have been set up with the operand type.
@@ -334,8 +334,7 @@ namespace llvm {

     std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                 const std::string &Constraint,
-                                 MVT VT) const override;
+                                 StringRef Constraint, MVT VT) const override;

     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -345,8 +344,8 @@ namespace llvm {
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const override;

-    unsigned getInlineAsmMemConstraint(
-        const std::string &ConstraintCode) const override {
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
       if (ConstraintCode == "Q")
         return InlineAsm::Constraint_Q;
       else if (ConstraintCode.size() == 2) {
@@ -533,7 +532,8 @@ namespace llvm {
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

-    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;

     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index b8cac135baf6..61c45af26fe1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -306,8 +306,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
 def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;

-def IsLE : Predicate<"getTargetLowering()->isLittleEndian()">;
-def IsBE : Predicate<"getTargetLowering()->isBigEndian()">;
+def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;

 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 245c9e869bf6..37352810c99f 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -31,11 +31,13 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -65,12 +67,18 @@ namespace {
     static char ID;
     ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

+    const MachineFunction *MF;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
     const ARMSubtarget *STI;
     const TargetLowering *TL;
     ARMFunctionInfo *AFI;
-    RegScavenger *RS;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
     bool isThumb1, isThumb2;

     bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -80,64 +88,60 @@ namespace {
     }

   private:
+    /// A set of load/store MachineInstrs with same base register sorted by
+    /// offset.
     struct MemOpQueueEntry {
-      int Offset;
-      unsigned Reg;
-      bool isKill;
-      unsigned Position;
-      MachineBasicBlock::iterator MBBI;
-      bool Merged;
-      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
-                      MachineBasicBlock::iterator i)
-        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Position; ///< Position as counted from end of basic block.
+      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
+        : MI(MI), Offset(Offset), Position(Position) {}
     };
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
-    typedef MemOpQueue::iterator MemOpQueueIter;

-    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
-                          const MemOpQueue &MemOps, unsigned DefReg,
-                          unsigned RangeBegin, unsigned RangeEnd);
+    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+    /// merged into a LDM/STM.
+    struct MergeCandidate {
+      /// List of instructions ordered by load/store offset.
+      SmallVector<MachineInstr*, 4> Instrs;
+      /// Index in Instrs of the instruction being latest in the schedule.
+      unsigned LatestMIIdx;
+      /// Index in Instrs of the instruction being earliest in the schedule.
+      unsigned EarliestMIIdx;
+      /// Index into the basic block where the merged instruction will be
+      /// inserted. (See MemOpQueueEntry.Position)
+      unsigned InsertPos;
+      /// Whether the instructions can be merged into a ldm/stm instruction.
+      bool CanMergeToLSMulti;
+      /// Whether the instructions can be merged into a ldrd/strd instruction.
+      bool CanMergeToLSDouble;
+    };
+    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
+    SmallVector<const MergeCandidate*,4> Candidates;
+    SmallVector<MachineInstr*,4> MergeBaseCandidates;
+
+    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                            MachineBasicBlock::const_iterator Before);
+    unsigned findFreeReg(const TargetRegisterClass &RegClass);
     void UpdateBaseRegUses(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI,
-                           DebugLoc dl, unsigned Base, unsigned WordOffset,
+                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                            ARMCC::CondCodes Pred, unsigned PredReg);
-    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                  int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
-                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
-                  DebugLoc dl,
-                  ArrayRef<std::pair<unsigned, bool> > Regs,
-                  ArrayRef<unsigned> ImpDefs);
-    void MergeOpsUpdate(MachineBasicBlock &MBB,
-                        MemOpQueue &MemOps,
-                        unsigned memOpsBegin,
-                        unsigned memOpsEnd,
-                        unsigned insertAfter,
-                        int Offset,
-                        unsigned Base,
-                        bool BaseKill,
-                        unsigned Opcode,
-                        ARMCC::CondCodes Pred,
-                        unsigned PredReg,
-                        unsigned Scratch,
-                        DebugLoc dl,
-                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
-                      unsigned Opcode, unsigned Size,
-                      ARMCC::CondCodes Pred, unsigned PredReg,
-                      unsigned Scratch, MemOpQueue &MemOps,
-                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
+    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
+    void FormCandidates(const MemOpQueue &MemOps);
+    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI);
-    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator MBBI,
-                                  const TargetInstrInfo *TII,
-                                  bool &Advance,
-                                  MachineBasicBlock::iterator &I);
-    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   bool &Advance,
-                                   MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
   };
@@ -185,6 +189,14 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
   return Offset;
 }

+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+  return MI.getOperand(1);
+}
+
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+  return MI.getOperand(0);
+}
+
 static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
   switch (Opcode) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -348,6 +360,10 @@ static bool isi32Store(unsigned Opc) {
   return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
 }

+static bool isLoadSingle(unsigned Opc) {
+  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+}
+
 static unsigned getImmScale(unsigned Opc) {
   switch (Opc) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -365,12 +381,55 @@ static unsigned getImmScale(unsigned Opc) {
   }
 }

+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
 /// Update future uses of the base register with the offset introduced
 /// due to writeback. This function only works on Thumb1.
 void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        DebugLoc dl, unsigned Base,
+                                        DebugLoc DL, unsigned Base,
                                         unsigned WordOffset,
                                         ARMCC::CondCodes Pred, unsigned PredReg) {
   assert(isThumb1 && "Can only update base register uses for Thumb1!");
@@ -398,7 +457,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
       Offset = MO.getImm() - WordOffset * getImmScale(Opc);

       // If storing the base register, it needs to be reset first.
-      unsigned InstrSrcReg = MBBI->getOperand(0).getReg();
+      unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

       if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
         MO.setImm(Offset);
@@ -439,7 +498,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
     if (InsertSub) {
       // An instruction above couldn't be updated, so insert a sub.
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
         .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
       return;
     }
@@ -457,31 +516,65 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
     // See PR21029.
     if (MBBI != MBB.end()) --MBBI;
     AddDefaultT1CC(
-      BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
       .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
   }
 }

+/// Return the first register of class \p RegClass that is not in \p Regs.
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+  if (!RegClassInfoValid) {
+    RegClassInfo.runOnMachineFunction(*MF);
+    RegClassInfoValid = true;
+  }
+
+  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+    if (!LiveRegs.contains(Reg))
+      return Reg;
+  return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal
+/// schedule direction). Computes backwards so multiple queries in the same
+/// block must come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                                         MachineBasicBlock::const_iterator Before) {
+  // Initialize if we never queried in this block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveOuts(&MBB, true);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+  // Move backward just before the "Before" position.
+  while (LiveRegPos != Before) {
+    --LiveRegPos;
+    LiveRegs.stepBackward(*LiveRegPos);
+  }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (const std::pair<unsigned, bool> &R : Regs)
+    if (R.first == Reg)
+      return true;
+  return false;
+}
+
 /// Create and insert a LDM or STM with Base as base register and registers in
 /// Regs as the register operands that would be loaded / stored.  It returns
 /// true if the transformation is done.
-bool
-ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          int Offset, unsigned Base, bool BaseKill,
-                          unsigned Opcode, ARMCC::CondCodes Pred,
-                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
-                          ArrayRef<std::pair<unsigned, bool> > Regs,
-                          ArrayRef<unsigned> ImpDefs) {
-  // Only a single register to load / store. Don't bother.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
   unsigned NumRegs = Regs.size();
-  if (NumRegs <= 1)
-    return false;
+  assert(NumRegs > 1);

   // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
   // Compute liveness information for that register to make the decision.
   bool SafeToClobberCPSR = !isThumb1 ||
-    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) ==
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
      MachineBasicBlock::LQR_Dead);

   bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
@@ -489,17 +582,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   // Exception: If the base register is in the input reglist, Thumb1 LDM is
   // non-writeback.
   // It's also not possible to merge an STR of the base register in Thumb1.
-  if (isThumb1)
-    for (const std::pair<unsigned, bool> &R : Regs)
-      if (Base == R.first) {
-        assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
-        if (Opcode == ARM::tLDRi) {
-          Writeback = false;
-          break;
-        } else if (Opcode == ARM::tSTRi) {
-          return false;
-        }
-      }
+  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+    if (Opcode == ARM::tLDRi) {
+      Writeback = false;
+    } else if (Opcode == ARM::tSTRi) {
+      return nullptr;
+    }
+  }

   ARM_AM::AMSubMode Mode = ARM_AM::ia;
   // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
@@ -516,18 +606,18 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
     // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
-    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

     // If starting offset isn't zero, insert a MI to materialize a new base.
     // But only do so if it is cost effective, i.e. merging more than two
     // loads / stores.
     if (NumRegs <= 2)
-      return false;
+      return nullptr;

     // On Thumb1, it's not worth materializing a new base register without
     // clobbering the CPSR (i.e. not using ADDS/SUBS).
     if (!SafeToClobberCPSR)
-      return false;
+      return nullptr;

     unsigned NewBase;
     if (isi32Load(Opcode)) {
@@ -535,10 +625,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       // use as the new base.
       NewBase = Regs[NumRegs-1].first;
     } else {
-      // Use the scratch register to use as a new base.
-      NewBase = Scratch;
+      // Find a free register that we can use as scratch register.
+      moveLiveRegsBefore(MBB, InsertBefore);
+      // The merged instruction does not exist yet but will use several Regs if
+      // it is a Store.
+      if (!isLoadSingle(Opcode))
+        for (const std::pair<unsigned, bool> &R : Regs)
+          LiveRegs.addReg(R.first);
+
+      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
       if (NewBase == 0)
-        return false;
+        return nullptr;
     }

     int BaseOpc =
@@ -557,7 +654,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,

     if (!TL->isLegalAddImmediate(Offset))
       // FIXME:  Try add with register operand?
-      return false; // Probably not worth it then.
+      return nullptr; // Probably not worth it then.
+
+    // We can only append a kill flag to the add/sub input if the value is not
+    // used in the register list of the stm as well.
+    bool KillOldBase = BaseKill &&
+      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

     if (isThumb1) {
       // Thumb1: depending on immediate size, use either
@@ -572,43 +674,44 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
           !STI->hasV6Ops()) {
         // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
         if (Pred != ARMCC::AL)
-          return false;
-        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
-          .addReg(Base, getKillRegState(BaseKill));
+          return nullptr;
+        BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase));
       } else
-        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
-          .addReg(Base, getKillRegState(BaseKill))
+        BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase))
           .addImm(Pred).addReg(PredReg);

-      // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
+      // The following ADDS/SUBS becomes an update.
       Base = NewBase;
-      BaseKill = false;
+      KillOldBase = true;
     }

     if (BaseOpc == ARM::tADDrSPi) {
       assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
-      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
         .addImm(Pred).addReg(PredReg);
     } else
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      AddDefaultT1CC(
+        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
         .addImm(Pred).addReg(PredReg);
     } else {
-      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
         .addImm(Pred).addReg(PredReg).addReg(0);
     }
     Base = NewBase;
     BaseKill = true; // New base is always killed straight away.
   }

-  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
-                Opcode == ARM::VLDRD);
+  bool isDef = isLoadSingle(Opcode);

   // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
   // base register writeback.
   Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
-  if (!Opcode) return false;
+  if (!Opcode)
+    return nullptr;

   // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
   // - There is no writeback (LDM of base register),
@@ -619,7 +722,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   // It's safe to return here since the code to materialize a new base register
   // above is also conditional on SafeToClobberCPSR.
   if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
-    return false;
+    return nullptr;

   MachineInstrBuilder MIB;

@@ -628,7 +731,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       // Update tLDMIA with writeback if necessary.
       Opcode = ARM::tLDMIA_UPD;

-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

     // Thumb1: we might need to set base writeback when building the MI.
     MIB.addReg(Base, getDefRegState(true))
@@ -637,381 +740,257 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
     // The base isn't dead after a merged instruction with writeback.
     // Insert a sub instruction after the newly formed instruction to reset.
     if (!BaseKill)
-      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

   } else {
     // No writeback, simply build the MachineInstr.
-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
     MIB.addReg(Base, getKillRegState(BaseKill));
   }

   MIB.addImm(Pred).addReg(PredReg);

   for (const std::pair<unsigned, bool> &R : Regs)
-    MIB = MIB.addReg(R.first, getDefRegState(isDef)
-                     | getKillRegState(R.second));
+    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

-  // Add implicit defs for super-registers.
-  for (unsigned ImpDef : ImpDefs)
-    MIB.addReg(ImpDef, RegState::ImplicitDefine);
-
-  return true;
+  return MIB.getInstr();
 }

-/// Find all instructions using a given imp-def within a range.
-///
-/// We are trying to combine a range of instructions, one of which (located at
-/// position RangeBegin) implicitly defines a register. The final LDM/STM will
-/// be placed at RangeEnd, and so any uses of this definition between RangeStart
-/// and RangeEnd must be modified to use an undefined value.
-///
-/// The live range continues until we find a second definition or one of the
-/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
-/// we must consider all uses and decide which are relevant in a second pass.
-void ARMLoadStoreOpt::findUsesOfImpDef(
-    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
-    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
-  std::map<unsigned, MachineOperand *> Uses;
-  unsigned LastLivePos = RangeEnd;
-
-  // First we find all uses of this register with Position between RangeBegin
-  // and RangeEnd, any or all of these could be uses of a definition at
-  // RangeBegin. We also record the latest position a definition at RangeBegin
-  // would be considered live.
-  for (unsigned i = 0; i < MemOps.size(); ++i) {
-    MachineInstr &MI = *MemOps[i].MBBI;
-    unsigned MIPosition = MemOps[i].Position;
-    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
-      continue;
-
-    // If this instruction defines the register, then any later use will be of
-    // that definition rather than ours.
-    if (MI.definesRegister(DefReg))
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
-    if (!UseOp)
-      continue;
-
-    // If this instruction kills the register then (assuming liveness is
-    // correct when we start) we don't need to think about anything after here.
-    if (UseOp->isKill())
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    Uses[MIPosition] = UseOp;
-  }
-
-  // Now we traverse the list of all uses, and append the ones that actually use
-  // our definition to the requested list.
-  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
-                                                      E = Uses.end();
-       I != E; ++I) {
-    // List is sorted by position so once we've found one out of range there
-    // will be no more to consider.
-    if (I->first > LastLivePos)
-      break;
-    UsesOfImpDefs.push_back(I->second);
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+  bool IsLoad = isi32Load(Opcode);
+  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+  assert(Regs.size() == 2);
+  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+                                    TII->get(LoadStoreOpcode));
+  if (IsLoad) {
+    MIB.addReg(Regs[0].first, RegState::Define)
+       .addReg(Regs[1].first, RegState::Define);
+  } else {
+    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
   }
+  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  return MIB.getInstr();
 }

 /// Call MergeOps and update MemOps and merges accordingly on success.
-void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
-                                     MemOpQueue &memOps,
-                                     unsigned memOpsBegin, unsigned memOpsEnd,
-                                     unsigned insertAfter, int Offset,
-                                     unsigned Base, bool BaseKill,
-                                     unsigned Opcode,
-                                     ARMCC::CondCodes Pred, unsigned PredReg,
-                                     unsigned Scratch,
-                                     DebugLoc dl,
-                                     SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
-  // First calculate which of the registers should be killed by the merged
-  // instruction.
-  const unsigned insertPos = memOps[insertAfter].Position;
-  SmallSet<unsigned, 4> KilledRegs;
-  DenseMap<unsigned, unsigned> Killer;
-  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
-    if (i == memOpsBegin) {
-      i = memOpsEnd;
-      if (i == e)
-        break;
-    }
-    if (memOps[i].Position < insertPos && memOps[i].isKill) {
-      unsigned Reg = memOps[i].Reg;
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    unsigned Reg = MO.getReg();
+    bool IsKill = MO.isKill();
+    if (IsKill)
       KilledRegs.insert(Reg);
-      Killer[Reg] = i;
+    Regs.push_back(std::make_pair(Reg, IsKill));
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers, after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // remove these implicit operands.
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef() || MO.isDead())
+          continue;
+        assert(MO.isImplicit());
+        unsigned DefReg = MO.getReg();
+
+        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
+          continue;
+        // We can ignore cases where the super-reg is read and written.
+        if (MI->readsRegister(DefReg))
+          continue;
+        ImpDefs.push_back(DefReg);
+      }
     }
   }

-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    MachineOperand &TransferOp = memOps[i].MBBI->getOperand(0);
-    if (TransferOp.isUse() && TransferOp.getReg() == Base)
-      BaseKill = false;
+  // Attempt the merge.
+  typedef MachineBasicBlock::iterator iterator;
+  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+  iterator InsertBefore = std::next(iterator(LatestMI));
+  MachineBasicBlock &MBB = *LatestMI->getParent();
+  unsigned Offset = getMemoryOpOffset(First);
+  unsigned Base = getLoadStoreBaseOp(*First).getReg();
+  bool BaseKill = LatestMI->killsRegister(Base);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+  DebugLoc DL = First->getDebugLoc();
+  MachineInstr *Merged = nullptr;
+  if (Cand.CanMergeToLSDouble)
+    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+                                   Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged && Cand.CanMergeToLSMulti)
+    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+                                  Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged)
+    return nullptr;
+
+  // Determine earliest instruction that will get removed. We then keep an
+  // iterator just above it so the following erases don't invalidate it.
+  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+  bool EarliestAtBegin = false;
+  if (EarliestI == MBB.begin()) {
+    EarliestAtBegin = true;
+  } else {
+    EarliestI = std::prev(EarliestI);
   }

-  SmallVector<std::pair<unsigned, bool>, 8> Regs;
-  SmallVector<unsigned, 8> ImpDefs;
-  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    unsigned Reg = memOps[i].Reg;
-    // If we are inserting the merged operation after an operation that
-    // uses the same register, make sure to transfer any kill flag.
-    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
-    Regs.push_back(std::make_pair(Reg, isKill));
-
-    // Collect any implicit defs of super-registers. They must be preserved.
-    for (const MachineOperand &MO : memOps[i].MBBI->operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead())
-        continue;
-      unsigned DefReg = MO.getReg();
-      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
-        ImpDefs.push_back(DefReg);
-
-      // There may be other uses of the definition between this instruction and
-      // the eventual LDM/STM position. These should be marked undef if the
-      // merge takes place.
-      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
-                       insertPos);
+  // Remove instructions which have been merged.
+  for (MachineInstr *MI : Cand.Instrs)
+    MBB.erase(MI);
+
+  // Determine range between the earliest removed instruction and the new one.
+  if (EarliestAtBegin)
+    EarliestI = MBB.begin();
+  else
+    EarliestI = std::next(EarliestI);
+  auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+  if (isLoadSingle(Opcode)) {
+    // If the previous loads defined a super-reg, then we have to mark earlier
+    // operands undef; replicate the super-reg def on the merged instruction.
+    for (MachineInstr &MI : FixupRange) {
+      for (unsigned &ImpDefReg : ImpDefs) {
+        for (MachineOperand &MO : MI.implicit_operands()) {
+          if (!MO.isReg() || MO.getReg() != ImpDefReg)
+            continue;
+          if (MO.readsReg())
+            MO.setIsUndef();
+          else if (MO.isDef())
+            ImpDefReg = 0;
+        }
+      }
     }
-  }

-  // Try to do the merge.
-  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
-  ++Loc;
-  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
-                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
-    return;
-
-  // Merge succeeded, update records.
-  Merges.push_back(std::prev(Loc));
-
-  // In gathering loads together, we may have moved the imp-def of a register
-  // past one of its uses. This is OK, since we know better than the rest of
-  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
-  // affected uses.
-  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
-                                                   E = UsesOfImpDefs.end();
-       I != E; ++I)
-    (*I)->setIsUndef();
-
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    // Remove kill flags from any memops that come before insertPos.
-    if (Regs[i-memOpsBegin].second) {
-      unsigned Reg = Regs[i-memOpsBegin].first;
-      if (KilledRegs.count(Reg)) {
-        unsigned j = Killer[Reg];
-        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
-        assert(Idx >= 0 && "Cannot find killing operand");
-        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
-        memOps[j].isKill = false;
+    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+    for (unsigned ImpDef : ImpDefs)
+      MIB.addReg(ImpDef, RegState::ImplicitDefine);
+  } else {
+    // Remove kill flags: We are possibly storing the values later now.
+    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+    for (MachineInstr &MI : FixupRange) {
+      for (MachineOperand &MO : MI.uses()) {
+        if (!MO.isReg() || !MO.isKill())
+          continue;
+        if (KilledRegs.count(MO.getReg()))
+          MO.setIsKill(false);
       }
-      memOps[i].isKill = true;
     }
-    MBB.erase(memOps[i].MBBI);
-    // Update this memop to refer to the merged instruction.
-    // We may need to move kill flags again.
-    memOps[i].Merged = true;
-    memOps[i].MBBI = Merges.back();
-    memOps[i].Position = insertPos;
+    assert(ImpDefs.empty());
   }

-  // Update memOps offsets, since they may have been modified by MergeOps.
-  for (auto &MemOp : memOps) {
-    MemOp.Offset = getMemoryOpOffset(MemOp.MBBI);
-  }
+  return Merged;
 }

-/// Merge a number of load / store instructions into one or more load / store
-/// multiple instructions.
-void
-ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
-                              unsigned Base, unsigned Opcode, unsigned Size,
-                              ARMCC::CondCodes Pred, unsigned PredReg,
-                              unsigned Scratch, MemOpQueue &MemOps,
-                              SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
-  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
-  int Offset = MemOps[SIndex].Offset;
-  int SOffset = Offset;
-  unsigned insertAfter = SIndex;
-  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
-  DebugLoc dl = Loc->getDebugLoc();
-  const MachineOperand &PMO = Loc->getOperand(0);
-  unsigned PReg = PMO.getReg();
-  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
-  unsigned Count = 1;
-  unsigned Limit = ~0U;
-  bool BaseKill = false;
-  // vldm / vstm limit are 32 for S variants, 16 for D variants.
+static bool isValidLSDoubleOffset(int Offset) {
+  unsigned Value = abs(Offset);
+  // t2LDRDi8/t2STRDi8 support an 8-bit immediate which is internally
+  // multiplied by 4.
+  return (Value % 4) == 0 && Value < 1024;
+}
-  switch (Opcode) {
-  default: break;
-  case ARM::VSTRS:
-    Limit = 32;
-    break;
-  case ARM::VSTRD:
-    Limit = 16;
-    break;
-  case ARM::VLDRD:
-    Limit = 16;
-    break;
-  case ARM::VLDRS:
-    Limit = 32;
-    break;
-  }
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+  const MachineInstr *FirstMI = MemOps[0].MI;
+  unsigned Opcode = FirstMI->getOpcode();
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+  unsigned SIndex = 0;
+  unsigned EIndex = MemOps.size();
+  do {
+    // Look at the first instruction.
+    const MachineInstr *MI = MemOps[SIndex].MI;
+    int Offset = MemOps[SIndex].Offset;
+    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+    unsigned PReg = PMO.getReg();
+    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+    unsigned Latest = SIndex;
+    unsigned Earliest = SIndex;
+    unsigned Count = 1;
+    bool CanMergeToLSDouble =
+      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    if (STI->isCortexM3() && isi32Load(Opcode) &&
+        PReg == getLoadStoreBaseOp(*MI).getReg())
+      CanMergeToLSDouble = false;
+
+    bool CanMergeToLSMulti = true;
+    // On Swift, avoid vldm/vstm starting with an odd register number, as that
+    // needs more uops than single vldrs.
+    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+      CanMergeToLSMulti = false;
+
+    // Merge following instructions where possible.
+    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+      int NewOffset = MemOps[I].Offset;
+      if (NewOffset != Offset + (int)Size)
+        break;
+      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+      unsigned Reg = MO.getReg();
+      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+
+      // See if the current load/store may be part of a multi load/store.
+      bool PartOfLSMulti = CanMergeToLSMulti;
+      if (PartOfLSMulti) {
+        // Cannot load from SP
+        if (Reg == ARM::SP)
+          PartOfLSMulti = false;
+        // Register numbers must be in ascending order.
+        else if (RegNum <= PRegNum)
+          PartOfLSMulti = false;
+        // For VFP / NEON load/store multiples, the registers must be
+        // consecutive and within the limit on the number of registers per
+        // instruction.
+        else if (!isNotVFP && RegNum != PRegNum+1)
+          PartOfLSMulti = false;
+      }
+      // See if the current load/store may be part of a double load/store.
+      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;
-  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
-    int NewOffset = MemOps[i].Offset;
-    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
-    unsigned Reg = MO.getReg();
-    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
-    // Register numbers must be in ascending order. For VFP / NEON load and
-    // store multiples, the registers must also be consecutive and within the
-    // limit on the number of registers per instruction.
-    if (Reg != ARM::SP &&
-        NewOffset == Offset + (int)Size &&
-        ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1)) &&
-        // On Swift we don't want vldm/vstm to start with a odd register num
-        // because Q register unaligned vldm/vstm need more uops.
-        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+      if (!PartOfLSMulti && !PartOfLSDouble)
+        break;
+      CanMergeToLSMulti &= PartOfLSMulti;
+      CanMergeToLSDouble &= PartOfLSDouble;
+      // Track MemOp with latest and earliest position (Positions are
+      // counted in reverse).
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
       Offset += Size;
       PRegNum = RegNum;
-      ++Count;
-    } else {
-      // Can't merge this in. Try merge the earlier ones first.
-      // We need to compute BaseKill here because the MemOps may have been
-      // reordered.
-      BaseKill = Loc->killsRegister(Base);
-
-      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
-                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
-                   MemOps, Merges);
-      return;
     }
-    if (MemOps[i].Position > MemOps[insertAfter].Position) {
-      insertAfter = i;
-      Loc = MemOps[i].MBBI;
-    }
-  }
-
-  BaseKill = Loc->killsRegister(Base);
-  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
-                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-}
-
-static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
-                                unsigned Bytes, unsigned Limit,
-                                ARMCC::CondCodes Pred, unsigned PredReg) {
-  unsigned MyPredReg = 0;
-  if (!MI)
-    return false;
-
-  bool CheckCPSRDef = false;
-  switch (MI->getOpcode()) {
-  default: return false;
-  case ARM::tSUBi8:
-  case ARM::t2SUBri:
-  case ARM::SUBri:
-    CheckCPSRDef = true;
-    break;
-  case ARM::tSUBspi:
-    break;
-  }
-
-  // Make sure the offset fits in 8 bits.
-  if (Bytes == 0 || (Limit && Bytes >= Limit))
-    return false;
-
-  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
-                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
-  if (!(MI->getOperand(0).getReg() == Base &&
-        MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm() * Scale) == Bytes &&
-        getInstrPredicate(MI, MyPredReg) == Pred &&
-        MyPredReg == PredReg))
-    return false;
-
-  return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
-                                unsigned Bytes, unsigned Limit,
-                                ARMCC::CondCodes Pred, unsigned PredReg) {
-  unsigned MyPredReg = 0;
-  if (!MI)
-    return false;
-
-  bool CheckCPSRDef = false;
-  switch (MI->getOpcode()) {
-  default: return false;
-  case ARM::tADDi8:
-  case ARM::t2ADDri:
-  case ARM::ADDri:
-    CheckCPSRDef = true;
-    break;
-  case ARM::tADDspi:
-    break;
-  }
-
-  if (Bytes == 0 || (Limit && Bytes >= Limit))
-    // Make sure the offset fits in 8 bits.
-    return false;
-
-  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
-                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
-  if (!(MI->getOperand(0).getReg() == Base &&
-        MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm() * Scale) == Bytes &&
-        getInstrPredicate(MI, MyPredReg) == Pred &&
-        MyPredReg == PredReg))
-    return false;
-
-  return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
-  switch (MI->getOpcode()) {
-  default: return 0;
-  case ARM::LDRi12:
-  case ARM::STRi12:
-  case ARM::tLDRi:
-  case ARM::tSTRi:
-  case ARM::tLDRspi:
-  case ARM::tSTRspi:
-  case ARM::t2LDRi8:
-  case ARM::t2LDRi12:
-  case ARM::t2STRi8:
-  case ARM::t2STRi12:
-  case ARM::VLDRS:
-  case ARM::VSTRS:
-    return 4;
-  case ARM::VLDRD:
-  case ARM::VSTRD:
-    return 8;
-  case ARM::LDMIA:
-  case ARM::LDMDA:
-  case ARM::LDMDB:
-  case ARM::LDMIB:
-  case ARM::STMIA:
-  case ARM::STMDA:
-  case ARM::STMDB:
-  case ARM::STMIB:
-  case ARM::tLDMIA:
-  case ARM::tLDMIA_UPD:
-  case ARM::tSTMIA_UPD:
-  case ARM::t2LDMIA:
-  case ARM::t2LDMDB:
-  case ARM::t2STMIA:
-  case ARM::t2STMDB:
-  case ARM::VLDMSIA:
-  case ARM::VSTMSIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
-  case ARM::VLDMDIA:
-  case ARM::VSTMDIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
-  }
+    // Form a candidate from the Ops collected so far.
+    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+      Candidate->Instrs.push_back(MemOps[C].MI);
+    Candidate->LatestMIIdx = Latest - SIndex;
+    Candidate->EarliestMIIdx = Earliest - SIndex;
+    Candidate->InsertPos = MemOps[Latest].Position;
+    if (Count == 1)
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
+    Candidates.push_back(Candidate);
+    // Continue after the chain.
+    SIndex += Count;
+  } while (SIndex < EIndex);
 }

 static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
@@ -1081,6 +1060,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
   }
 }

+/// Check if the given instruction increments or decrements a register and
+/// returns the amount it is incremented/decremented. Returns 0 if the CPSR
+/// flags generated by the instruction are possibly read as well.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+                                  ARMCC::CondCodes Pred, unsigned PredReg) {
+  bool CheckCPSRDef;
+  int Scale;
+  switch (MI.getOpcode()) {
+  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
+  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
+  case ARM::t2SUBri:
+  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
+  case ARM::t2ADDri:
+  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
+  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
+  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+  default: return 0;
+  }
+
+  unsigned MIPredReg;
+  if (MI.getOperand(0).getReg() != Reg ||
+      MI.getOperand(1).getReg() != Reg ||
+      getInstrPredicate(&MI, MIPredReg) != Pred ||
+      MIPredReg != PredReg)
+    return 0;
+
+  if (CheckCPSRDef && definesCPSR(&MI))
+    return 0;
+  return MI.getOperand(2).getImm() * Scale;
+}
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (MBBI == BeginMBBI)
+    return EndMBBI;
+
+  // Skip debug values.
+  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+    --PrevMBBI;
+
+  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+  // Skip debug values.
+  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+    ++NextMBBI;
+  if (NextMBBI == EndMBBI)
+    return EndMBBI;
+
+  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
 /// Fold preceding/trailing inc/dec of base register into the
 /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
 ///
@@ -1093,21 +1141,17 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
 /// ldmia rn, <ra, rb, rc>
 /// =>
 /// ldmdb rn!, <ra, rb, rc>
-bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
   // Thumb1 is already using updating loads/stores.
   if (isThumb1) return false;

-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(0).getReg();
-  bool BaseKill = MI->getOperand(0).isKill();
-  unsigned Bytes = getLSMultipleTransferSize(MI);
+  const MachineOperand &BaseOP = MI->getOperand(0);
+  unsigned Base = BaseOP.getReg();
+  bool BaseKill = BaseOP.isKill();
   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();

   // Can't use an updating ld/st if the base register is also a dest
   // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
@@ -1115,55 +1159,27 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MI->getOperand(i).getReg() == Base)
       return false;

-  bool DoMerge = false;
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
   ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
-
-  // Try merging with the previous instruction.
-  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
-  if (MBBI != BeginMBBI) {
-    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
-    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
-      --PrevMBBI;
-    if (Mode == ARM_AM::ia &&
-        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      Mode = ARM_AM::db;
-      DoMerge = true;
-    } else if (Mode == ARM_AM::ib &&
-               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      Mode = ARM_AM::da;
-      DoMerge = true;
-    }
-    if (DoMerge)
-      MBB.erase(PrevMBBI);
-  }
-
-  // Try merging with the next instruction.
-  MachineBasicBlock::iterator EndMBBI = MBB.end();
-  if (!DoMerge && MBBI != EndMBBI) {
-    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
-    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
-      ++NextMBBI;
-    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
-        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      DoMerge = true;
-    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
-               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
-      MBB.erase(NextMBBI);
-    }
+  if (Mode == ARM_AM::ia && Offset == -Bytes) {
+    Mode = ARM_AM::db;
+  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+    Mode = ARM_AM::da;
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
+      return false;
   }
-
-  if (!DoMerge)
-    return false;
+  MBB.erase(MergeInstr);

   unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
     .addReg(Base, getDefRegState(true)) // WB base register
     .addReg(Base, getKillRegState(BaseKill))
     .addImm(Pred).addReg(PredReg);
@@ -1231,21 +1247,15 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,

 /// Fold preceding/trailing inc/dec of base register into the
 /// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               const TargetInstrInfo *TII,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
   // Thumb1 doesn't have updating LDR/STR.
   // FIXME: Use LDM/STM with single register instead.
   if (isThumb1) return false;

-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(1).getReg();
-  bool BaseKill = MI->getOperand(1).isKill();
-  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
   bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
@@ -1255,7 +1265,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
     return false;

-  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
   // Can't do the merge if the destination register is the same as the would-be
   // writeback register.
   if (MI->getOperand(0).getReg() == Base)
@@ -1263,64 +1272,38 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,

   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
-  bool DoMerge = false;
-  ARM_AM::AddrOpc AddSub = ARM_AM::add;
-  unsigned NewOpc = 0;
-  // AM2 - 12 bits, thumb2 - 8 bits.
-  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
-
-  // Try merging with the previous instruction.
-  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
-  if (MBBI != BeginMBBI) {
-    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
-    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
-      --PrevMBBI;
-    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
-      DoMerge = true;
-      AddSub = ARM_AM::sub;
-    } else if (!isAM5 &&
-               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
-      MBB.erase(PrevMBBI);
-    }
-  }
-
-  // Try merging with the next instruction.
-  MachineBasicBlock::iterator EndMBBI = MBB.end();
-  if (!DoMerge && MBBI != EndMBBI) {
-    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
-    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
-      ++NextMBBI;
-    if (!isAM5 &&
-        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
-      DoMerge = true;
-      AddSub = ARM_AM::sub;
-    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
-      MBB.erase(NextMBBI);
-    }
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  unsigned NewOpc;
+  if (!isAM5 && Offset == Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+  } else if (Offset == -Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (Offset == Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+    } else if (!isAM5 && Offset == -Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+    } else
+      return false;
   }
+  MBB.erase(MergeInstr);

-  if (!DoMerge)
-    return false;
+  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+  bool isLd = isLoadSingle(Opcode);

   if (isAM5) {
     // VLDM[SD]_UPD, VSTM[SD]_UPD
     // (There are no base-updating versions of VLDR/VSTR instructions, but the
     // updating load/store-multiple instructions can be used with only one
     // register.)
     MachineOperand &MO = MI->getOperand(0);
-    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
       .addReg(Base, getDefRegState(true)) // WB base register
       .addReg(Base, getKillRegState(isLd ? BaseKill : false))
       .addImm(Pred).addReg(PredReg)
@@ -1330,20 +1313,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     if (isAM2) {
       // LDR_PRE, LDR_POST
       if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
-        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
-Bytes : Bytes; - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1353,15 +1334,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1371,6 +1351,66 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return true; } +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) + return false; + + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) + return false; + + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? 
ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); + + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); + + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MBB.erase(MBBI); + return true; +} + /// Returns true if instruction is a memory operation that this pass is capable /// of operating on. static bool isMemoryOp(const MachineInstr *MI) { @@ -1426,26 +1466,10 @@ static bool isMemoryOp(const MachineInstr *MI) { return false; } -/// Advance register scavenger to just before the earliest memory op that is -/// being merged. -void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { - MachineBasicBlock::iterator Loc = MemOps[0].MBBI; - unsigned Position = MemOps[0].Position; - for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { - if (MemOps[i].Position < Position) { - Position = MemOps[i].Position; - Loc = MemOps[i].MBBI; - } - } - - if (Loc != MBB.begin()) - RS->forward(std::prev(Loc)); -} - static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int Offset, bool isDef, - DebugLoc dl, unsigned NewOpc, + DebugLoc DL, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, bool OffKill, bool OffUndef, @@ -1491,7 +1515,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (!Errata602117 && !NonConsecutiveRegs) return false; - MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? @@ -1531,7 +1554,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } - NewBBI = std::prev(MBBI); } else { // Split into two instructions. 
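// For illustration, the shape of this split (registers and offsets are made
// up; the real values come from the OffImm and OffImm+4 calls below):
//   ldrd r1, r2, [r0, #8]
// becomes
//   ldr  r1, [r0, #8]
//   ldr  r2, [r0, #12]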
unsigned NewOpc = (isLd) @@ -1553,7 +1575,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1573,7 +1594,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1585,191 +1605,160 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, ++NumSTRD2STR; } - MBB.erase(MI); - MBBI = NewBBI; + MBBI = MBB.erase(MBBI); return true; } /// An optimization pass to turn multiple LDR / STR ops of the same base and /// incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { - unsigned NumMerges = 0; - unsigned NumMemOps = 0; MemOpQueue MemOps; unsigned CurrBase = 0; unsigned CurrOpc = ~0u; - unsigned CurrSize = 0; ARMCC::CondCodes CurrPred = ARMCC::AL; - unsigned CurrPredReg = 0; unsigned Position = 0; - SmallVector<MachineBasicBlock::iterator,4> Merges; - - RS->enterBasicBlock(&MBB); - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { + assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); + LiveRegsValid = false; + + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); if (FixInvalidRegPairOp(MBB, MBBI)) continue; + ++Position; - bool Advance = false; - bool TryMerge = false; - - bool isMemOp = isMemoryOp(MBBI); - if (isMemOp) { + if (isMemoryOp(MBBI)) { unsigned Opcode = MBBI->getOpcode(); - unsigned Size = getLSMultipleTransferSize(MBBI); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); - bool isKill = MO.isDef() ? false : MO.isKill(); - unsigned Base = MBBI->getOperand(1).getReg(); + unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); int Offset = getMemoryOpOffset(MBBI); - // Watch out for: - // r4 := ldr [r5] - // r5 := ldr [r5, #4] - // r6 := ldr [r5, #8] - // - // The second ldr has effectively broken the chain even though it - // looks like the later ldr(s) use the same base register. Try to - // merge the ldr's so far, including this one. But don't try to - // combine the following ldr(s). - bool Clobber = isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg(); - - // Watch out for: - // r4 := ldr [r0, #8] - // r4 := ldr [r0, #4] - // - // The optimization may reorder the second ldr in front of the first - // ldr, which violates write after write(WAW) dependence. The same as - // str. Try to merge inst(s) already in MemOps. - bool Overlap = false; - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { - if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) { - Overlap = true; - break; - } - } - - if (CurrBase == 0 && !Clobber) { + if (CurrBase == 0) { // Start of a new chain. 
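// A chain, for illustration (same base register, opcode and predicate,
// offsets strictly increasing; registers are made up):
//   ldr r1, [r4]
//   ldr r2, [r4, #4]
//   ldr r3, [r4, #8]
// which a later merge can turn into:
//   ldmia r4, {r1, r2, r3}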
CurrBase = Base; CurrOpc = Opcode; - CurrSize = Size; CurrPred = Pred; - CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); - ++NumMemOps; - Advance = true; - } else if (!Overlap) { - if (Clobber) { - TryMerge = true; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } + // Note: No need to match PredReg in the next if. + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // or + // r0 := ldr [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoadSingle(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } } - if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { - // No need to match PredReg. - // Continue adding to the queue. + if (!Overlap) { + // Check offset and sort memory operation into the current chain. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; } else { - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); - I != E; ++I) { - if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. break; - } else if (Offset == I->Offset) { - // Collision! This can't be merged! + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; break; } } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } } } } - } - if (MBBI->isDebugValue()) { - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else if (Advance) { - ++Position; - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else { - TryMerge = true; + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) { + continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); } - if (TryMerge) { - if (NumMemOps > 1) { - // Try to find a free register to use as a new base in case it's needed. - // First advance to the instruction just before the start of the chain. - AdvanceRS(MBB, MemOps); - - // Find a scratch register. - unsigned Scratch = - RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); - - // Process the load / store instructions. - RS->forward(std::prev(MBBI)); - - // Merge ops. - Merges.clear(); - MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, - CurrPred, CurrPredReg, Scratch, MemOps, Merges); - - // Try folding preceding/trailing base inc/dec into the generated - // LDM/STM ops. 
- for (unsigned i = 0, e = Merges.size(); i < e; ++i) - if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) - ++NumMerges; - NumMerges += Merges.size(); - - // Try folding preceding/trailing base inc/dec into those load/store - // that were not merged to form LDM/STM ops. - for (unsigned i = 0; i != NumMemOps; ++i) - if (!MemOps[i].Merged) - if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) - ++NumMerges; - - // RS may be pointing to an instruction that's deleted. - RS->skipTo(std::prev(MBBI)); - } else if (NumMemOps == 1) { - // Try folding preceding/trailing base inc/dec into the single - // load/store. - if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { - ++NumMerges; - RS->forward(std::prev(MBBI)); - } - } + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. CurrBase = 0; CurrOpc = ~0u; - CurrSize = 0; CurrPred = ARMCC::AL; - CurrPredReg = 0; - if (NumMemOps) { - MemOps.clear(); - NumMemOps = 0; - } + MemOps.clear(); + } + } + if (MemOps.size() > 0) + FormCandidates(MemOps); - // If iterator hasn't been advanced and this is not a memory op, skip it. - // It can't start a new chain anyway. - if (!Advance && !isMemOp && MBBI != E) { - ++Position; - ++MBBI; + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. + bool Changed = false; + for (const MergeCandidate *Candidate : Candidates) { + if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) { + MachineInstr *Merged = MergeOpsUpdate(*Candidate); + // Merge preceding/trailing base inc/dec into the merged op. + if (Merged) { + Changed = true; + unsigned Opcode = Merged->getOpcode(); + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else + MergeBaseUpdateLSMultiple(Merged); + } else { + for (MachineInstr *MI : Candidate->Instrs) { + if (MergeBaseUpdateLoadStore(MI)) + Changed = true; + } } + } else { + assert(Candidate->Instrs.size() == 1); + if (MergeBaseUpdateLoadStore(Candidate->Instrs.front())) + Changed = true; } } - return NumMerges > 0; + Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. 
+ for (MachineInstr *MI : MergeBaseCandidates)
+ MergeBaseUpdateLSDouble(*MI);
+ MergeBaseCandidates.clear();
+
+ return Changed;
}

/// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
@@ -1814,12 +1803,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TL = STI->getTargetLowering();
AFI = Fn.getInfo<ARMFunctionInfo>();
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
- RS = new RegScavenger();
+ MRI = &Fn.getRegInfo();
+ RegClassInfoValid = false;
isThumb2 = AFI->isThumb2Function();
isThumb1 = AFI->isThumbFunction() && !isThumb2;

@@ -1832,7 +1823,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Modified |= MergeReturnIntoLDM(MBB);
}

- delete RS;
+ Allocator.DestroyAll();
return Modified;
}

@@ -2219,7 +2210,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
continue;

int Opc = MI->getOpcode();
- bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+ bool isLd = isLoadSingle(Opc);
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);

diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a59cf9851108..6cafbbb9f8eb 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -18,12 +18,6 @@ using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"

-ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
-}
-
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
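A minimal sketch of that memset-of-zero conversion; AEABI_MEMCLR is the
enumerator used in the hunk below, while AEABI_MEMSET, the Src operand and
the abbreviated control flow are assumptions made for illustration only:

  if (AEABILibcall == AEABI_MEMSET)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src))
      if (C->getZExtValue() == 0)
        AEABILibcall = AEABI_MEMCLR; // memset(p, 0, n) -> __aeabi_memclr*(p, n)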
@@ -83,7 +77,7 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - Entry.Ty = TLI->getDataLayout()->getIntPtrType(*DAG.getContext()); + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); Entry.Node = Dst; Args.push_back(Entry); if (AEABILibcall == AEABI_MEMCLR) { @@ -121,12 +115,14 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" } }; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(LC), - Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], - TLI->getPointerTy()), std::move(Args), 0) - .setDiscardResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee( + TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], + TLI->getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); return CallResult.second; diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 1db190f41e1a..289879ee1d7e 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -37,8 +37,6 @@ namespace ARM_AM { class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit ARMSelectionDAGInfo(const DataLayout &DL); - ~ARMSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 55808dfb9efe..002c3e9b6291 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -112,7 +112,6 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), - TSInfo(*TM.getDataLayout()), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. @@ -172,6 +171,7 @@ void ARMSubtarget::initializeEnvironment() { AllowsUnalignedMem = false; Thumb2DSP = false; UseNaClTrap = false; + GenLongCalls = false; UnsafeFPMath = false; } @@ -286,7 +286,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, if (RelocM == Reloc::Static) return false; - bool isDecl = GV->isDeclarationForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); if (!isTargetMachO()) { // Extra load is needed for all externally visible. @@ -294,34 +294,22 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, return false; return true; } else { - if (RelocM == Reloc::PIC_) { - // If this is a strong reference to a definition, it is definitely not - // through a stub. - if (!isDecl && !GV->isWeakForLinker()) - return false; - - // Unless we have a symbol with hidden visibility, we have to go through a - // normal $non_lazy_ptr stub because this symbol might be resolved late. - if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. - return true; + // If this is a strong reference to a definition, it is definitely not + // through a stub. 
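// ("Strong" here follows isStrongDefinitionForLinker(): a definition that is
//  neither a declaration nor weak for the linker, so the linker cannot
//  replace it with another copy at link time.)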
+ if (isDef) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + if (RelocM == Reloc::PIC_) { // If symbol visibility is hidden, we have a stub for common symbol // references and external declarations. - if (isDecl || GV->hasCommonLinkage()) + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) // Hidden $non_lazy_ptr reference. return true; - - return false; - } else { - // If this is a strong reference to a definition, it is definitely not - // through a stub. - if (!isDecl && !GV->isWeakForLinker()) - return false; - - // Unless we have a symbol with hidden visibility, we have to go through a - // normal $non_lazy_ptr stub because this symbol might be resolved late. - if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. - return true; } } diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 9909a6a6d198..dd101df9b63d 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -206,6 +206,9 @@ protected: /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; + /// Generate calls via indirect call instructions. + bool GenLongCalls; + /// Target machine allowed unsafe FP math (such as use of NEON fp) bool UnsafeFPMath; @@ -342,6 +345,7 @@ public: bool hasMPExtension() const { return HasMPExtension; } bool hasThumb2DSP() const { return Thumb2DSP; } bool useNaClTrap() const { return UseNaClTrap; } + bool genLongCalls() const { return GenLongCalls; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 6e81bd2d349d..93495d66ae70 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -80,8 +80,7 @@ computeTargetABI(const Triple &TT, StringRef CPU, // FIXME: This is duplicated code from the front end and should be unified. 
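// (Behavioural sketch of the MachO checks below: an EABI environment, an
//  unknown OS with MachO, or a CPU starting with "cortex-m" selects AAPCS;
//  the elided else branch presumably keeps the APCS-derived Darwin default.)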
if (TT.isOSBinFormatMachO()) { if (TT.getEnvironment() == llvm::Triple::EABI || - (TT.getOS() == llvm::Triple::UnknownOS && - TT.getObjectFormat() == llvm::Triple::MachO) || + (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || CPU.startswith("cortex-m")) { TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; } else { @@ -104,8 +103,8 @@ computeTargetABI(const Triple &TT, StringRef CPU, TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; break; default: - if (TT.getOS() == llvm::Triple::NetBSD) - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; + if (TT.isOSNetBSD()) + TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; else TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; break; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index f4901fc24e44..2f194cf7ae06 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -61,14 +61,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); if (Idx != -1) return LT.first * NEONFltDblTbl[Idx].Cost; } - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); @@ -282,8 +282,8 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } }; - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); + EVT SelCondTy = TLI->getValueType(DL, CondTy); + EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, SelCondTy.getSimpleVT(), @@ -292,7 +292,7 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return NEONVectorSelectTbl[Idx].Cost; } - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); return LT.first; } @@ -353,7 +353,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx == -1) @@ -379,7 +379,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); int Idx = CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx == -1) @@ -395,7 +395,7 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd2PropInfo) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); const unsigned FunctionCallDivCost = 20; const unsigned ReciprocalDivCost = 10; @@ -468,7 +468,7 @@ 
unsigned ARMTTIImpl::getArithmeticInstrCost( unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { @@ -488,12 +488,12 @@ unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index f2e5db655ccf..84f256f73722 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -42,7 +42,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { public: explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. ARMTTIImpl(const ARMTTIImpl &Arg) @@ -50,18 +51,6 @@ public: ARMTTIImpl(ARMTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - ARMTTIImpl &operator=(const ARMTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - ARMTTIImpl &operator=(ARMTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c2db74619871..f8f0eb2d4baa 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -189,9 +189,9 @@ class ARMAsmParser : public MCTargetAsmParser { return getParser().Error(L, Msg, Ranges); } - bool validatetLDMRegList(MCInst Inst, const OperandVector &Operands, + bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo, bool IsARPop = false); - bool validatetSTMRegList(MCInst Inst, const OperandVector &Operands, + bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo); int tryParseRegister(); @@ -242,6 +242,8 @@ class ARMAsmParser : public MCTargetAsmParser { bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode); + void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting, + OperandVector &Operands); bool isThumb() const { // FIXME: Can tablegen auto-generate this? 
return STI.getFeatureBits()[ARM::ModeThumb];
@@ -5465,6 +5467,92 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode = true;
}

+// \brief Some Thumb instructions have two operand forms that are not
+// available as three operand; convert to the two operand form if possible.
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+ bool CarrySetting,
+ OperandVector &Operands) {
+ if (Operands.size() != 6)
+ return;
+
+ const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+ auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+ if (!Op3.isReg() || !Op4.isReg())
+ return;
+
+ auto Op3Reg = Op3.getReg();
+ auto Op4Reg = Op4.getReg();
+
+ // For most Thumb2 cases we just generate the 3 operand form and reduce
+ // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+ // won't accept SP or PC so we do the transformation here taking care
+ // with immediate range in the 'add sp, sp, #imm' case.
+ auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+ if (isThumbTwo()) {
+ if (Mnemonic != "add")
+ return;
+ bool TryTransform = Op3Reg == ARM::PC || Op4Reg == ARM::PC ||
+ (Op5.isReg() && Op5.getReg() == ARM::PC);
+ if (!TryTransform) {
+ TryTransform = (Op3Reg == ARM::SP || Op4Reg == ARM::SP ||
+ (Op5.isReg() && Op5.getReg() == ARM::SP)) &&
+ !(Op3Reg == ARM::SP && Op4Reg == ARM::SP &&
+ Op5.isImm() && !Op5.isImm0_508s4());
+ }
+ if (!TryTransform)
+ return;
+ } else if (!isThumbOne())
+ return;
+
+ if (!(Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+ Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+ Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+ Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic"))
+ return;
+
+ // If first 2 operands of a 3 operand instruction are the same
+ // then transform to 2 operand version of the same instruction
+ // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+ bool Transform = Op3Reg == Op4Reg;
+
+ // For commutative operations, we might be able to transform if we swap
+ // Op4 and Op5. The 'ADD Rdm, SP, Rdm' form is already handled specially
+ // as tADDrsp.
+ const ARMOperand *LastOp = &Op5;
+ bool Swap = false;
+ if (!Transform && Op5.isReg() && Op3Reg == Op5.getReg() &&
+ ((Mnemonic == "add" && Op4Reg != ARM::SP) ||
+ Mnemonic == "and" || Mnemonic == "eor" ||
+ Mnemonic == "adc" || Mnemonic == "orr")) {
+ Swap = true;
+ LastOp = &Op4;
+ Transform = true;
+ }
+
+ // If both registers are the same then remove one of them from
+ // the operand list, with certain exceptions.
+ if (Transform) {
+ // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+ // 2 operand forms don't exist.
+ if (((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub") &&
+ LastOp->isReg())
+ Transform = false;
+
+ // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+ // 3-bits because the ARMARM says not to.
+ if ((Mnemonic == "add" || Mnemonic == "sub") && LastOp->isImm0_7())
+ Transform = false;
+ }
+
+ if (Transform) {
+ if (Swap)
+ std::swap(Op4, Op5);
+ Operands.erase(Operands.begin() + 3);
+ }
+}
+
bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
OperandVector &Operands) {
// FIXME: This is all horribly hacky.
We really need a better way to deal
@@ -5838,6 +5926,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
"VFP/Neon double precision register expected");
}

+ tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -5901,48 +5991,6 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}

- // If first 2 operands of a 3 operand instruction are the same
- // then transform to 2 operand version of the same instruction
- // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 6 &&
- (Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
- Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
- Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
- Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic")) {
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- ARMOperand &Op4 = static_cast<ARMOperand &>(*Operands[4]);
- ARMOperand &Op5 = static_cast<ARMOperand &>(*Operands[5]);
-
- // If both registers are the same then remove one of them from
- // the operand list.
- if (Op3.isReg() && Op4.isReg() && Op3.getReg() == Op4.getReg()) {
- // If 3rd operand (variable Op5) is a register and the instruction is adds/sub
- // then do not transform as the backend already handles this instruction
- // correctly.
- if (!Op5.isReg() || !((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub")) {
- Operands.erase(Operands.begin() + 3);
- if (Mnemonic == "add" && !CarrySetting) {
- // Special case for 'add' (not 'adds') instruction must
- // remove the CCOut operand as well.
- Operands.erase(Operands.begin() + 1);
- }
- }
- }
- }
-
- // If instruction is 'add' and first two register operands
- // use SP register, then remove one of the SP registers from
- // the instruction.
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 5 && Mnemonic == "add" && !CarrySetting) {
- ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- if (Op2.isReg() && Op3.isReg() && Op2.getReg() == ARM::SP && Op3.getReg() == ARM::SP) {
- Operands.erase(Operands.begin() + 2);
- }
- }
-
// GNU Assembler extension (compatibility)
if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
@@ -5985,8 +6033,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// return 'true' if register list contains non-low GPR registers,
// 'false' otherwise. If Reg is in the register list or is HiReg, set
// 'containsReg' to true.
-static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
- unsigned HiReg, bool &containsReg) {
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+ unsigned Reg, unsigned HiReg,
+ bool &containsReg) {
containsReg = false;
for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
unsigned OpReg = Inst.getOperand(i).getReg();
@@ -6001,8 +6050,8 @@ static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
// Check if the specified register is in the register list of the inst,
// starting at the indicated operand number.
-static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) { - for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) { +static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) { + for (unsigned i = OpNo, e = Inst.getNumOperands(); i < e; ++i) { unsigned OpReg = Inst.getOperand(i).getReg(); if (OpReg == Reg) return true; @@ -6020,7 +6069,7 @@ static bool instIsBreakpoint(const MCInst &Inst) { } -bool ARMAsmParser::validatetLDMRegList(MCInst Inst, +bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo, bool IsARPop) { const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); @@ -6043,7 +6092,7 @@ bool ARMAsmParser::validatetLDMRegList(MCInst Inst, return false; } -bool ARMAsmParser::validatetSTMRegList(MCInst Inst, +bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo) { const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); @@ -8167,8 +8216,16 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // If the destination and first source operand are the same, and // there's no setting of the flags, use encoding T2 instead of T3. // Note that this is only for ADD, not SUB. This mirrors the system - // 'as' behaviour. Make sure the wide encoding wasn't explicit. - if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || + // 'as' behaviour. Also take advantage of ADD being commutative. + // Make sure the wide encoding wasn't explicit. + bool Swap = false; + auto DestReg = Inst.getOperand(0).getReg(); + bool Transform = DestReg == Inst.getOperand(1).getReg(); + if (!Transform && DestReg == Inst.getOperand(2).getReg()) { + Transform = true; + Swap = true; + } + if (!Transform || Inst.getOperand(5).getReg() != 0 || (static_cast<ARMOperand &>(*Operands[3]).isToken() && static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) @@ -8177,7 +8234,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.setOpcode(ARM::tADDhirr); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); - TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(Swap ? 
1 : 2)); TmpInst.addOperand(Inst.getOperand(3)); TmpInst.addOperand(Inst.getOperand(4)); Inst = TmpInst; @@ -9176,8 +9233,7 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { return false; } - STI.InitMCProcessorInfo(CPU, ""); - STI.InitCPUSchedModel(CPU); + STI.setDefaultFeatures(CPU); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); return false; diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 4d12bfb5d60f..d17fdb95dbdf 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1362,7 +1362,7 @@ MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); - if (TT.getObjectFormat() == Triple::ELF) + if (TT.isOSBinFormatELF()) return new ARMTargetELFStreamer(S); return new ARMTargetStreamer(S); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index fafe25ae5be5..21c9fc1e58b2 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -31,7 +31,7 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "ARMGenRegisterInfo.inc" -static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] && (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) && @@ -63,7 +63,7 @@ static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) { @@ -75,7 +75,7 @@ static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && "cannot predicate thumb instructions"); @@ -92,7 +92,7 @@ static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getARMLoadDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && "cannot predicate thumb instructions"); @@ -257,9 +257,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, ArchFS = FS; } - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitARMMCSubtargetInfo(X, TT, CPU, ArchFS); - return X; + return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS); } static MCInstrInfo *createARMMCInstrInfo() { @@ -268,7 +266,7 @@ static MCInstrInfo *createARMMCInstrInfo() { return X; } -static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) { +static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC); return X; @@ -279,10 +277,10 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO()) MAI = new 
ARMMCAsmInfoDarwin(TheTriple); - else if (TheTriple.isWindowsItaniumEnvironment()) - MAI = new ARMCOFFMCAsmInfoGNU(); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new ARMCOFFMCAsmInfoMicrosoft(); + else if (TheTriple.isOSWindows()) + MAI = new ARMCOFFMCAsmInfoGNU(); else MAI = new ARMELFMCAsmInfo(TheTriple); @@ -292,14 +290,13 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { - Triple TheTriple(TT); // Default relocation model on Darwin is PIC, not DynamicNoPIC. - RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; + RM = TT.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; } X->initMCCodeGenInfo(RM, CM, OL); return X; diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 77cd890e4cad..3b4358b5d9bf 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -365,7 +365,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, // frame pointer stack slot, the target is ELF and the function has FP, or // the target uses var sized objects. if (NumBytes) { - assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + assert(!MFI->getPristineRegs(MF).test(ARM::R4) && "No scratch register to restore SP from FP!"); emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, TII, *RegInfo); diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp index 54c5ececc7de..c2806c85f24f 100644 --- a/lib/Target/BPF/BPFFrameLowering.cpp +++ b/lib/Target/BPF/BPFFrameLowering.cpp @@ -29,12 +29,12 @@ void BPFFrameLowering::emitPrologue(MachineFunction &MF, void BPFFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {} -void BPFFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - - MRI.setPhysRegUnused(BPF::R6); - MRI.setPhysRegUnused(BPF::R7); - MRI.setPhysRegUnused(BPF::R8); - MRI.setPhysRegUnused(BPF::R9); +void BPFFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + SavedRegs.reset(BPF::R6); + SavedRegs.reset(BPF::R7); + SavedRegs.reset(BPF::R8); + SavedRegs.reset(BPF::R9); } diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 3b9fc443e053..251cda965ff5 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -28,8 +28,8 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index 38c56bbef81e..58498a1aec7d 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -302,8 +302,9 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 
DAG.getContext()->diagnose(Err); } + auto PtrVT = getPointerTy(MF.getDataLayout()); Chain = DAG.getCALLSEQ_START( - Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true), CLI.DL); + Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL); SmallVector<std::pair<unsigned, SDValue>, 5> RegsToPass; @@ -350,10 +351,10 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, getPointerTy(), + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT, G->getOffset(), 0); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) - Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), 0); + Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0); // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -374,8 +375,8 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Create the CALLSEQ_END node. Chain = DAG.getCALLSEQ_END( - Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true), - DAG.getConstant(0, CLI.DL, getPointerTy(), true), InFlag, CLI.DL); + Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), + DAG.getConstant(0, CLI.DL, PtrVT, true), InFlag, CLI.DL); InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp index 65acd585116d..c3a8b1caa63d 100644 --- a/lib/Target/BPF/BPFSubtarget.cpp +++ b/lib/Target/BPF/BPFSubtarget.cpp @@ -28,4 +28,4 @@ void BPFSubtarget::anchor() {} BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this), - TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {} + TLInfo(TM, *this) {} diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 3e928fc93a37..840570ebc392 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -40,7 +40,7 @@ static MCInstrInfo *createBPFMCInstrInfo() { return X; } -static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitBPFMCRegisterInfo(X, BPF::R11 /* RAReg doesn't exist */); return X; @@ -48,12 +48,10 @@ static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitBPFMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createBPFMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createBPFMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createBPFMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index bc5d7f65b2f6..272688edb8a1 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -2148,8 +2148,8 @@ char CppWriter::ID = 0; bool 
CPPTargetMachine::addPassesToEmitFile( PassManagerBase &PM, raw_pwrite_stream &o, CodeGenFileType FileType, - bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter, - MachineFunctionInitializer *MFInitializer) { + bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter, + AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; auto FOut = llvm::make_unique<formatted_raw_ostream>(o); diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index ebf0635b12e4..00e402feffbc 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -31,7 +31,8 @@ struct CPPTargetMachine : public TargetMachine { public: bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, bool DisableVerify, - AnalysisID StartAfter, AnalysisID StopAfter, + AnalysisID StartBefore, AnalysisID StartAfter, + AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) override; }; diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp new file mode 100644 index 000000000000..cb7e633fb82f --- /dev/null +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -0,0 +1,1127 @@ +//===--- BitTracker.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// SSA-based bit propagation. +// +// The purpose of this code is, for a given virtual register, to provide +// information about the value of each bit in the register. The values +// of bits are represented by the class BitValue, and take one of four +// cases: 0, 1, "ref" and "bottom". The 0 and 1 are rather clear, the +// "ref" value means that the bit is a copy of another bit (which itself +// cannot be a copy of yet another bit---such chains are not allowed). +// A "ref" value is associated with a BitRef structure, which indicates +// which virtual register, and which bit in that register is the origin +// of the value. For example, given an instruction +// vreg2 = ASL vreg1, 1 +// assuming that nothing is known about bits of vreg1, bit 1 of vreg2 +// will be a "ref" to (vreg1, 0). If there is a subsequent instruction +// vreg3 = ASL vreg2, 2 +// then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well. +// The "bottom" case means that the bit's value cannot be determined, +// and that this virtual register actually defines it. The "bottom" case +// is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref +// to self", so for the vreg1 above, the bit 0 of it will be a "ref" to +// (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc. +// +// The tracker implements the Wegman-Zadeck algorithm, originally developed +// for SSA-based constant propagation. Each register is represented as +// a sequence of bits, with the convention that bit 0 is the least signi- +// ficant bit. Each bit is propagated individually. The class RegisterCell +// implements the register's representation, and is also the subject of +// the lattice operations in the tracker. +// +// The intended usage of the bit tracker is to create a target-specific +// machine instruction evaluator, pass the evaluator to the BitTracker +// object, and run the tracker. 
The tracker will then collect the bit +// value information for a given machine function. After that, it can be +// queried for the cells for each virtual register. +// Sample code: +// const TargetSpecificEvaluator TSE(TRI, MRI); +// BitTracker BT(TSE, MF); +// BT.run(); +// ... +// unsigned Reg = interestingRegister(); +// RegisterCell RC = BT.get(Reg); +// if (RC[3].is(1)) +// Reg0bit3 = 1; +// +// The code below is intended to be fully target-independent. + +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include "BitTracker.h" + +using namespace llvm; + +typedef BitTracker BT; + +namespace { + // Local trickery to pretty print a register (without the whole "%vreg" + // business). + struct printv { + printv(unsigned r) : R(r) {} + unsigned R; + }; + raw_ostream &operator<< (raw_ostream &OS, const printv &PV) { + if (PV.R) + OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R); + else + OS << 's'; + return OS; + } +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::BitValue &BV) { + switch (BV.Type) { + case BT::BitValue::Top: + OS << 'T'; + break; + case BT::BitValue::Zero: + OS << '0'; + break; + case BT::BitValue::One: + OS << '1'; + break; + case BT::BitValue::Ref: + OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']'; + break; + } + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::RegisterCell &RC) { + unsigned n = RC.Bits.size(); + OS << "{ w:" << n; + // Instead of printing each bit value individually, try to group them + // into logical segments, such as sequences of 0 or 1 bits or references + // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz"). + // "Start" will be the index of the beginning of the most recent segment. + unsigned Start = 0; + bool SeqRef = false; // A sequence of refs to consecutive bits. + bool ConstRef = false; // A sequence of refs to the same bit. + + for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) { + const BT::BitValue &V = RC[i]; + const BT::BitValue &SV = RC[Start]; + bool IsRef = (V.Type == BT::BitValue::Ref); + // If the current value is the same as Start, skip to the next one. + if (!IsRef && V == SV) + continue; + if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) { + if (Start+1 == i) { + SeqRef = (V.RefI.Pos == SV.RefI.Pos+1); + ConstRef = (V.RefI.Pos == SV.RefI.Pos); + } + if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start)) + continue; + if (ConstRef && V.RefI.Pos == SV.RefI.Pos) + continue; + } + + // The current value is different. Print the previous one and reset + // the Start. 
+ OS << " [" << Start;
+ unsigned Count = i - Start;
+ if (Count == 1) {
+ OS << "]:" << SV;
+ } else {
+ OS << '-' << i-1 << "]:";
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ Start = i;
+ SeqRef = ConstRef = false;
+ }
+
+ OS << " [" << Start;
+ unsigned Count = n - Start;
+ if (n-Start == 1) {
+ OS << "]:" << RC[Start];
+ } else {
+ OS << '-' << n-1 << "]:";
+ const BT::BitValue &SV = RC[Start];
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ OS << " }";
+
+ return OS;
+}
+
+BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
+ : Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
+
+BitTracker::~BitTracker() {
+ delete &Map;
+}
+
+
+// If we were allowed to update a cell for a part of a register, the meet
+// operation would need to be parametrized by the register number and the
+// exact part of the register, so that the computed BitRefs correspond to
+// the actual bits of the "self" register.
+// While this cannot happen in the current implementation, I'm not sure
+// if this should be ruled out in the future.
+bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+ // An example when "meet" can be invoked with SelfR == 0 is a phi node
+ // with a physical register as an operand.
+ assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+ bool Changed = false;
+ for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
+ const BitValue &RCV = RC[i];
+ Changed |= Bits[i].meet(RCV, BitRef(SelfR, i));
+ }
+ return Changed;
+}
+
+
+// Insert the entire cell RC into the current cell at position given by M.
+BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
+ const BitMask &M) {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ // Sanity: M must be a valid mask for *this.
+ assert(B < W && E < W);
+ // Sanity: the masked part of *this must have the same number of bits
+ // as the source.
+ assert(B > E || E-B+1 == RC.width()); // B <= E => E-B+1 = |RC|.
+ assert(B <= E || E+(W-B)+1 == RC.width()); // E < B => E+(W-B)+1 = |RC|.
+ if (B <= E) {
+ for (uint16_t i = 0; i <= E-B; ++i)
+ Bits[i+B] = RC[i];
+ } else {
+ for (uint16_t i = 0; i < W-B; ++i)
+ Bits[i+B] = RC[i];
+ for (uint16_t i = 0; i <= E; ++i)
+ Bits[i] = RC[i+(W-B)];
+ }
+ return *this;
+}
+
+
+BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ assert(B < W && E < W);
+ if (B <= E) {
+ RegisterCell RC(E-B+1);
+ for (uint16_t i = B; i <= E; ++i)
+ RC.Bits[i-B] = Bits[i];
+ return RC;
+ }
+
+ RegisterCell RC(E+(W-B)+1);
+ for (uint16_t i = 0; i < W-B; ++i)
+ RC.Bits[i] = Bits[i+B];
+ for (uint16_t i = 0; i <= E; ++i)
+ RC.Bits[i+(W-B)] = Bits[i];
+ return RC;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
+ // Rotate left (i.e. towards increasing bit indices).
+ // Swap the two parts: [0..W-Sh-1] [W-Sh..W-1]
+ uint16_t W = width();
+ Sh = Sh % W;
+ if (Sh == 0)
+ return *this;
+
+ RegisterCell Tmp(W-Sh);
+ // Tmp = [0..W-Sh-1].
+ for (uint16_t i = 0; i < W-Sh; ++i)
+ Tmp[i] = Bits[i];
+ // Shift [W-Sh..W-1] to [0..Sh-1].
+ for (uint16_t i = 0; i < Sh; ++i)
+ Bits[i] = Bits[W-Sh+i];
+ // Copy Tmp to [Sh..W-1].
+ for (uint16_t i = 0; i < W-Sh; ++i) + Bits[i+Sh] = Tmp.Bits[i]; + return *this; +} + + +BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E, + const BitValue &V) { + assert(B <= E); + while (B < E) + Bits[B++] = V; + return *this; +} + + +BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) { + // Append the cell given as the argument to the "this" cell. + // Bit 0 of RC becomes bit W of the result, where W is this->width(). + uint16_t W = width(), WRC = RC.width(); + Bits.resize(W+WRC); + for (uint16_t i = 0; i < WRC; ++i) + Bits[i+W] = RC.Bits[i]; + return *this; +} + + +uint16_t BT::RegisterCell::ct(bool B) const { + uint16_t W = width(); + uint16_t C = 0; + BitValue V = B; + while (C < W && Bits[C] == V) + C++; + return C; +} + + +uint16_t BT::RegisterCell::cl(bool B) const { + uint16_t W = width(); + uint16_t C = 0; + BitValue V = B; + while (C < W && Bits[W-(C+1)] == V) + C++; + return C; +} + + +bool BT::RegisterCell::operator== (const RegisterCell &RC) const { + uint16_t W = Bits.size(); + if (RC.Bits.size() != W) + return false; + for (uint16_t i = 0; i < W; ++i) + if (Bits[i] != RC[i]) + return false; + return true; +} + + +uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { + // The general problem is with finding a register class that corresponds + // to a given reference reg:sub. There can be several such classes, and + // since we only care about the register size, it does not matter which + // such class we would find. + // The easiest way to accomplish what we want is to + // 1. find a physical register PhysR from the same class as RR.Reg, + // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub, + // 3. find a register class that contains PhysS. + unsigned PhysR; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + const TargetRegisterClass *VC = MRI.getRegClass(RR.Reg); + assert(VC->begin() != VC->end() && "Empty register class"); + PhysR = *VC->begin(); + } else { + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + PhysR = RR.Reg; + } + + unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS); + uint16_t BW = RC->getSize()*8; + return BW; +} + + +BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR, + const CellMapType &M) const { + uint16_t BW = getRegBitWidth(RR); + + // Physical registers are assumed to be present in the map with an unknown + // value. Don't actually insert anything in the map, just return the cell. + if (TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return RegisterCell::self(0, BW); + + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // For virtual registers that belong to a class that is not tracked, + // generate an "unknown" value as well. + const TargetRegisterClass *C = MRI.getRegClass(RR.Reg); + if (!track(C)) + return RegisterCell::self(0, BW); + + CellMapType::const_iterator F = M.find(RR.Reg); + if (F != M.end()) { + if (!RR.Sub) + return F->second; + BitMask M = mask(RR.Reg, RR.Sub); + return F->second.extract(M); + } + // If not found, create a "top" entry, but do not insert it in the map. + return RegisterCell::top(BW); +} + + +void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC, + CellMapType &M) const { + // While updating the cell map can be done in a meaningful way for + // a part of a register, it makes little sense to implement it as the + // SSA representation would never contain such "partial definitions". 
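+  // (Sub-register reads are still handled: getCell() above extracts the
+  // masked slice of the full cell on lookup.)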
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return;
+  assert(RR.Sub == 0 && "Unexpected sub-register in definition");
+  // Eliminate all ref-to-reg-0 bit values: replace them with "self".
+  for (unsigned i = 0, n = RC.width(); i < n; ++i) {
+    const BitValue &V = RC[i];
+    if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+      RC[i].RefI = BitRef(RR.Reg, i);
+  }
+  M[RR.Reg] = RC;
+}
+
+
+// Check if the cell represents a compile-time integer value.
+bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i)
+    if (!A[i].is(0) && !A[i].is(1))
+      return false;
+  return true;
+}
+
+
+// Convert a cell to the integer value. The result must fit in uint64_t.
+uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
+  assert(isInt(A));
+  uint64_t Val = 0;
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    Val <<= 1;
+    Val |= A[i].is(1);
+  }
+  return Val;
+}
+
+
+// Evaluator helper functions. These implement some common operations on
+// register cells that can be used to implement target-specific instructions
+// in a target-specific evaluator.
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
+  RegisterCell Res(W);
+  // For bits beyond the 63rd, this will generate the sign bit of V.
+  for (uint16_t i = 0; i < W; ++i) {
+    Res[i] = BitValue(V & 1);
+    V >>= 1;
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
+  APInt A = CI->getValue();
+  uint16_t BW = A.getBitWidth();
+  assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
+  RegisterCell Res(BW);
+  for (uint16_t i = 0; i < BW; ++i)
+    Res[i] = A[i];
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Carry = false;
+  uint16_t I;
+  for (I = 0; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (!V1.num() || !V2.num())
+      break;
+    unsigned S = bool(V1) + bool(V2) + Carry;
+    Res[I] = BitValue(S & 1);
+    Carry = (S > 1);
+  }
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    // If the next bit is the same as Carry, the result will be 0 plus the
+    // other bit. The Carry bit will remain unchanged.
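+    // E.g. if Carry == 0 and V1 is a known 0 while V2 is a ref to v5[7],
+    // the sum bit is simply that ref and Carry stays 0. Once neither
+    // operand matches Carry, the remaining bits become "self" (bottom).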
+ if (V1.is(Carry)) + Res[I] = BitValue::ref(V2); + else if (V2.is(Carry)) + Res[I] = BitValue::ref(V1); + else + break; + } + for (; I < W; ++I) + Res[I] = BitValue::self(); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + bool Borrow = false; + uint16_t I; + for (I = 0; I < W; ++I) { + const BitValue &V1 = A1[I]; + const BitValue &V2 = A2[I]; + if (!V1.num() || !V2.num()) + break; + unsigned S = bool(V1) - bool(V2) - Borrow; + Res[I] = BitValue(S & 1); + Borrow = (S > 1); + } + for (; I < W; ++I) { + const BitValue &V1 = A1[I]; + const BitValue &V2 = A2[I]; + if (V1.is(Borrow)) { + Res[I] = BitValue::ref(V2); + break; + } + if (V2.is(Borrow)) + Res[I] = BitValue::ref(V1); + else + break; + } + for (; I < W; ++I) + Res[I] = BitValue::self(); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width() + A2.width(); + uint16_t Z = A1.ct(0) + A2.ct(0); + RegisterCell Res(W); + Res.fill(0, Z, BitValue::Zero); + Res.fill(Z, W, BitValue::self()); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width() + A2.width(); + uint16_t Z = A1.ct(0) + A2.ct(0); + RegisterCell Res(W); + Res.fill(0, Z, BitValue::Zero); + Res.fill(Z, W, BitValue::self()); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1, + uint16_t Sh) const { + assert(Sh <= A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res.rol(Sh); + Res.fill(0, Sh, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1, + uint16_t Sh) const { + uint16_t W = A1.width(); + assert(Sh <= W); + RegisterCell Res = RegisterCell::ref(A1); + Res.rol(W-Sh); + Res.fill(W-Sh, W, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1, + uint16_t Sh) const { + uint16_t W = A1.width(); + assert(Sh <= W); + RegisterCell Res = RegisterCell::ref(A1); + BitValue Sign = Res[W-1]; + Res.rol(W-Sh); + Res.fill(W-Sh, W, Sign); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(1)) + Res[i] = BitValue::ref(V2); + else if (V2.is(1)) + Res[i] = BitValue::ref(V1); + else if (V1.is(0) || V2.is(0)) + Res[i] = BitValue::Zero; + else if (V1 == V2) + Res[i] = V1; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(1) || V2.is(1)) + Res[i] = BitValue::One; + else if (V1.is(0)) + Res[i] = BitValue::ref(V2); + else if (V2.is(0)) + Res[i] = BitValue::ref(V1); + else if (V1 == V2) + Res[i] = V1; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; 
++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(0)) + Res[i] = BitValue::ref(V2); + else if (V2.is(0)) + Res[i] = BitValue::ref(V1); + else if (V1 == V2) + Res[i] = BitValue::Zero; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const { + uint16_t W = A1.width(); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V = A1[i]; + if (V.is(0)) + Res[i] = BitValue::One; + else if (V.is(1)) + Res[i] = BitValue::Zero; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1, + uint16_t BitN) const { + assert(BitN < A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res[BitN] = BitValue::One; + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1, + uint16_t BitN) const { + assert(BitN < A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res[BitN] = BitValue::Zero; + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B, + uint16_t W) const { + uint16_t C = A1.cl(B), AW = A1.width(); + // If the last leading non-B bit is not a constant, then we don't know + // the real count. + if ((C < AW && A1[AW-1-C].num()) || C == AW) + return eIMM(C, W); + return RegisterCell::self(0, W); +} + + +BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B, + uint16_t W) const { + uint16_t C = A1.ct(B), AW = A1.width(); + // If the last trailing non-B bit is not a constant, then we don't know + // the real count. + if ((C < AW && A1[C].num()) || C == AW) + return eIMM(C, W); + return RegisterCell::self(0, W); +} + + +BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1, + uint16_t FromN) const { + uint16_t W = A1.width(); + assert(FromN <= W); + RegisterCell Res = RegisterCell::ref(A1); + BitValue Sign = Res[FromN-1]; + // Sign-extend "inreg". + Res.fill(FromN, W, Sign); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1, + uint16_t FromN) const { + uint16_t W = A1.width(); + assert(FromN <= W); + RegisterCell Res = RegisterCell::ref(A1); + Res.fill(FromN, W, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1, + uint16_t B, uint16_t E) const { + uint16_t W = A1.width(); + assert(B < W && E <= W); + if (B == E) + return RegisterCell(0); + uint16_t Last = (E > 0) ? E-1 : W-1; + RegisterCell Res = RegisterCell::ref(A1).extract(BT::BitMask(B, Last)); + // Return shorter cell. + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1, + const RegisterCell &A2, uint16_t AtN) const { + uint16_t W1 = A1.width(), W2 = A2.width(); + (void)W1; + assert(AtN < W1 && AtN+W2 <= W1); + // Copy bits from A1, insert A2 at position AtN. 
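+  // E.g. inserting a 2-bit A2 into an 8-bit A1 at AtN == 4 replaces
+  // bits [4-5]; all other result bits remain refs to A1.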
+  RegisterCell Res = RegisterCell::ref(A1);
+  if (W2 > 0)
+    Res.insert(RegisterCell::ref(A2), BT::BitMask(AtN, AtN+W2-1));
+  return Res;
+}
+
+
+BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+  assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
+  uint16_t W = getRegBitWidth(Reg);
+  assert(W > 0 && "Cannot generate mask for empty register");
+  return BitMask(0, W-1);
+}
+
+
+bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
+      const CellMapType &Inputs, CellMapType &Outputs) const {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::REG_SEQUENCE: {
+      RegisterRef RD = MI->getOperand(0);
+      assert(RD.Sub == 0);
+      RegisterRef RS = MI->getOperand(1);
+      unsigned SS = MI->getOperand(2).getImm();
+      RegisterRef RT = MI->getOperand(3);
+      unsigned ST = MI->getOperand(4).getImm();
+      assert(SS != ST);
+
+      uint16_t W = getRegBitWidth(RD);
+      RegisterCell Res(W);
+      Res.insert(RegisterCell::ref(getCell(RS, Inputs)), mask(RD.Reg, SS));
+      Res.insert(RegisterCell::ref(getCell(RT, Inputs)), mask(RD.Reg, ST));
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    case TargetOpcode::COPY: {
+      // COPY can transfer a smaller register into a wider one.
+      // If that is the case, fill the remaining high bits with 0.
+      RegisterRef RD = MI->getOperand(0);
+      RegisterRef RS = MI->getOperand(1);
+      assert(RD.Sub == 0);
+      uint16_t WD = getRegBitWidth(RD);
+      uint16_t WS = getRegBitWidth(RS);
+      assert(WD >= WS);
+      RegisterCell Src = getCell(RS, Inputs);
+      RegisterCell Res(WD);
+      Res.insert(Src, BitMask(0, WS-1));
+      Res.fill(WS, WD, BitValue::Zero);
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    default:
+      return false;
+  }
+
+  return true;
+}
+
+
+// Main W-Z (Wegman-Zadeck) implementation.
+
+void BT::visitPHI(const MachineInstr *PI) {
+  int ThisN = PI->getParent()->getNumber();
+  if (Trace)
+    dbgs() << "Visit PHI(BB#" << ThisN << "): " << *PI;
+
+  const MachineOperand &MD = PI->getOperand(0);
+  assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
+  RegisterRef DefRR(MD);
+  uint16_t DefBW = ME.getRegBitWidth(DefRR);
+
+  RegisterCell DefC = ME.getCell(DefRR, Map);
+  if (DefC == RegisterCell::self(DefRR.Reg, DefBW))   // XXX slow
+    return;
+
+  bool Changed = false;
+
+  for (unsigned i = 1, n = PI->getNumOperands(); i < n; i += 2) {
+    const MachineBasicBlock *PB = PI->getOperand(i+1).getMBB();
+    int PredN = PB->getNumber();
+    if (Trace)
+      dbgs() << "  edge BB#" << PredN << "->BB#" << ThisN;
+    if (!EdgeExec.count(CFGEdge(PredN, ThisN))) {
+      if (Trace)
+        dbgs() << " not executable\n";
+      continue;
+    }
+
+    RegisterRef RU = PI->getOperand(i);
+    RegisterCell ResC = ME.getCell(RU, Map);
+    if (Trace)
+      dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+             << " cell: " << ResC << "\n";
+    Changed |= DefC.meet(ResC, DefRR.Reg);
+  }
+
+  if (Changed) {
+    if (Trace)
+      dbgs() << "Output: " << PrintReg(DefRR.Reg, &ME.TRI, DefRR.Sub)
+             << " cell: " << DefC << "\n";
+    ME.putCell(DefRR, DefC, Map);
+    visitUsesOf(DefRR.Reg);
+  }
+}
+
+
+void BT::visitNonBranch(const MachineInstr *MI) {
+  if (Trace) {
+    int ThisN = MI->getParent()->getNumber();
+    dbgs() << "Visit MI(BB#" << ThisN << "): " << *MI;
+  }
+  if (MI->isDebugValue())
+    return;
+  assert(!MI->isBranch() && "Unexpected branch instruction");
+
+  CellMapType ResMap;
+  bool Eval = ME.evaluate(MI, Map, ResMap);
+
+  if (Trace && Eval) {
+    for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      RegisterRef RU(MO);
+      dbgs() <<
" input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub) + << " cell: " << ME.getCell(RU, Map) << "\n"; + } + dbgs() << "Outputs:\n"; + for (CellMapType::iterator I = ResMap.begin(), E = ResMap.end(); + I != E; ++I) { + RegisterRef RD(I->first); + dbgs() << " " << PrintReg(I->first, &ME.TRI) << " cell: " + << ME.getCell(RD, ResMap) << "\n"; + } + } + + // Iterate over all definitions of the instruction, and update the + // cells accordingly. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + // Visit register defs only. + if (!MO.isReg() || !MO.isDef()) + continue; + RegisterRef RD(MO); + assert(RD.Sub == 0 && "Unexpected sub-register in definition"); + if (!TargetRegisterInfo::isVirtualRegister(RD.Reg)) + continue; + + bool Changed = false; + if (!Eval || !ResMap.has(RD.Reg)) { + // Set to "ref" (aka "bottom"). + uint16_t DefBW = ME.getRegBitWidth(RD); + RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW); + if (RefC != ME.getCell(RD, Map)) { + ME.putCell(RD, RefC, Map); + Changed = true; + } + } else { + RegisterCell DefC = ME.getCell(RD, Map); + RegisterCell ResC = ME.getCell(RD, ResMap); + // This is a non-phi instruction, so the values of the inputs come + // from the same registers each time this instruction is evaluated. + // During the propagation, the values of the inputs can become lowered + // in the sense of the lattice operation, which may cause different + // results to be calculated in subsequent evaluations. This should + // not cause the bottoming of the result in the map, since the new + // result is already reflecting the lowered inputs. + for (uint16_t i = 0, w = DefC.width(); i < w; ++i) { + BitValue &V = DefC[i]; + // Bits that are already "bottom" should not be updated. + if (V.Type == BitValue::Ref && V.RefI.Reg == RD.Reg) + continue; + // Same for those that are identical in DefC and ResC. + if (V == ResC[i]) + continue; + V = ResC[i]; + Changed = true; + } + if (Changed) + ME.putCell(RD, DefC, Map); + } + if (Changed) + visitUsesOf(RD.Reg); + } +} + + +void BT::visitBranchesFrom(const MachineInstr *BI) { + const MachineBasicBlock &B = *BI->getParent(); + MachineBasicBlock::const_iterator It = BI, End = B.end(); + BranchTargetList Targets, BTs; + bool FallsThrough = true, DefaultToAll = false; + int ThisN = B.getNumber(); + + do { + BTs.clear(); + const MachineInstr *MI = &*It; + if (Trace) + dbgs() << "Visit BR(BB#" << ThisN << "): " << *MI; + assert(MI->isBranch() && "Expecting branch instruction"); + InstrExec.insert(MI); + bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough); + if (!Eval) { + // If the evaluation failed, we will add all targets. Keep going in + // the loop to mark all executable branches as such. + DefaultToAll = true; + FallsThrough = true; + if (Trace) + dbgs() << " failed to evaluate: will add all CFG successors\n"; + } else if (!DefaultToAll) { + // If evaluated successfully add the targets to the cumulative list. + if (Trace) { + dbgs() << " adding targets:"; + for (unsigned i = 0, n = BTs.size(); i < n; ++i) + dbgs() << " BB#" << BTs[i]->getNumber(); + if (FallsThrough) + dbgs() << "\n falls through\n"; + else + dbgs() << "\n does not fall through\n"; + } + Targets.insert(BTs.begin(), BTs.end()); + } + ++It; + } while (FallsThrough && It != End); + + typedef MachineBasicBlock::const_succ_iterator succ_iterator; + if (!DefaultToAll) { + // Need to add all CFG successors that lead to EH landing pads. 
+ // There won't be explicit branches to these blocks, but they must + // be processed. + for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) { + const MachineBasicBlock *SB = *I; + if (SB->isLandingPad()) + Targets.insert(SB); + } + if (FallsThrough) { + MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator Next = std::next(BIt); + if (Next != MF.end()) + Targets.insert(&*Next); + } + } else { + for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) + Targets.insert(*I); + } + + for (unsigned i = 0, n = Targets.size(); i < n; ++i) { + int TargetN = Targets[i]->getNumber(); + FlowQ.push(CFGEdge(ThisN, TargetN)); + } +} + + +void BT::visitUsesOf(unsigned Reg) { + if (Trace) + dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n"; + + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + use_iterator End = MRI.use_nodbg_end(); + for (use_iterator I = MRI.use_nodbg_begin(Reg); I != End; ++I) { + MachineInstr *UseI = I->getParent(); + if (!InstrExec.count(UseI)) + continue; + if (UseI->isPHI()) + visitPHI(UseI); + else if (!UseI->isBranch()) + visitNonBranch(UseI); + else + visitBranchesFrom(UseI); + } +} + + +BT::RegisterCell BT::get(RegisterRef RR) const { + return ME.getCell(RR, Map); +} + + +void BT::put(RegisterRef RR, const RegisterCell &RC) { + ME.putCell(RR, RC, Map); +} + + +// Replace all references to bits from OldRR with the corresponding bits +// in NewRR. +void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { + assert(Map.has(OldRR.Reg) && "OldRR not present in map"); + BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub); + BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub); + uint16_t OMB = OM.first(), OME = OM.last(); + uint16_t NMB = NM.first(), NME = NM.last(); + (void)NME; + assert((OME-OMB == NME-NMB) && + "Substituting registers of different lengths"); + for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { + RegisterCell &RC = I->second; + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + BitValue &V = RC[i]; + if (V.Type != BitValue::Ref || V.RefI.Reg != OldRR.Reg) + continue; + if (V.RefI.Pos < OMB || V.RefI.Pos > OME) + continue; + V.RefI.Reg = NewRR.Reg; + V.RefI.Pos += NMB-OMB; + } + } +} + + +// Check if the block has been "executed" during propagation. (If not, the +// block is dead, but it may still appear to be reachable.) +bool BT::reached(const MachineBasicBlock *B) const { + int BN = B->getNumber(); + assert(BN >= 0); + for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end(); + I != E; ++I) { + if (I->second == BN) + return true; + } + return false; +} + + +void BT::reset() { + EdgeExec.clear(); + InstrExec.clear(); + Map.clear(); +} + + +void BT::run() { + reset(); + assert(FlowQ.empty()); + + typedef GraphTraits<const MachineFunction*> MachineFlowGraphTraits; + const MachineBasicBlock *Entry = MachineFlowGraphTraits::getEntryNode(&MF); + + unsigned MaxBN = 0; + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + assert(I->getNumber() >= 0 && "Disconnected block"); + unsigned BN = I->getNumber(); + if (BN > MaxBN) + MaxBN = BN; + } + + // Keep track of visited blocks. + BitVector BlockScanned(MaxBN+1); + + int EntryN = Entry->getNumber(); + // Generate a fake edge to get something to start with. 
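+  // (Block numbers were asserted to be non-negative above, so the pseudo
+  // predecessor -1 cannot collide with a real CFG edge.)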
+  FlowQ.push(CFGEdge(-1, EntryN));
+
+  while (!FlowQ.empty()) {
+    CFGEdge Edge = FlowQ.front();
+    FlowQ.pop();
+
+    if (EdgeExec.count(Edge))
+      continue;
+    EdgeExec.insert(Edge);
+
+    const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
+    MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
+    // Visit PHI nodes first.
+    while (It != End && It->isPHI()) {
+      const MachineInstr *PI = &*It++;
+      InstrExec.insert(PI);
+      visitPHI(PI);
+    }
+
+    // If this block has already been visited through a flow graph edge,
+    // then the instructions have already been processed. Any updates to
+    // the cells would now only happen through visitUsesOf...
+    if (BlockScanned[Edge.second])
+      continue;
+    BlockScanned[Edge.second] = true;
+
+    // Visit non-branch instructions.
+    while (It != End && !It->isBranch()) {
+      const MachineInstr *MI = &*It++;
+      InstrExec.insert(MI);
+      visitNonBranch(MI);
+    }
+    // If block end has been reached, add the fall-through edge to the queue.
+    if (It == End) {
+      MachineFunction::const_iterator BIt = &B;
+      MachineFunction::const_iterator Next = std::next(BIt);
+      if (Next != MF.end()) {
+        int ThisN = B.getNumber();
+        int NextN = Next->getNumber();
+        FlowQ.push(CFGEdge(ThisN, NextN));
+      }
+    } else {
+      // Handle the remaining sequence of branches. This function will update
+      // the work queue.
+      visitBranchesFrom(It);
+    }
+  } // while (!FlowQ.empty())
+
+  if (Trace) {
+    dbgs() << "Cells after propagation:\n";
+    for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
+      dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
+  }
+}
+
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
new file mode 100644
index 000000000000..ed002a794d66
--- /dev/null
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -0,0 +1,449 @@
+//===--- BitTracker.h -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITTRACKER_H
+#define BITTRACKER_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+  class ConstantInt;
+  class MachineRegisterInfo;
+  class MachineBasicBlock;
+  class MachineInstr;
+  class MachineOperand;
+  class raw_ostream;
+
+struct BitTracker {
+  struct BitRef;
+  struct RegisterRef;
+  struct BitValue;
+  struct BitMask;
+  struct RegisterCell;
+  struct MachineEvaluator;
+
+  typedef SetVector<const MachineBasicBlock *> BranchTargetList;
+
+  struct CellMapType : public std::map<unsigned,RegisterCell> {
+    bool has(unsigned Reg) const;
+  };
+
+  BitTracker(const MachineEvaluator &E, MachineFunction &F);
+  ~BitTracker();
+
+  void run();
+  void trace(bool On = false) { Trace = On; }
+  bool has(unsigned Reg) const;
+  const RegisterCell &lookup(unsigned Reg) const;
+  RegisterCell get(RegisterRef RR) const;
+  void put(RegisterRef RR, const RegisterCell &RC);
+  void subst(RegisterRef OldRR, RegisterRef NewRR);
+  bool reached(const MachineBasicBlock *B) const;
+
+private:
+  void visitPHI(const MachineInstr *PI);
+  void visitNonBranch(const MachineInstr *MI);
+  void visitBranchesFrom(const MachineInstr *BI);
+  void visitUsesOf(unsigned Reg);
+  void reset();
+
+  typedef std::pair<int,int> CFGEdge;
+  typedef std::set<CFGEdge> EdgeSetType;
+  typedef std::set<const MachineInstr *> InstrSetType;
+  typedef std::queue<CFGEdge> EdgeQueueType;
+
+  EdgeSetType EdgeExec;    // Executable flow graph edges.
+  InstrSetType InstrExec;  // Executable instructions.
+  EdgeQueueType FlowQ;     // Work queue of CFG edges.
+  bool Trace;              // Enable tracing for debugging.
+
+  const MachineEvaluator &ME;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CellMapType &Map;
+};
+
+
+// Abstraction of a reference to a bit at position Pos from a register Reg.
+struct BitTracker::BitRef {
+  BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+  BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {}
+  bool operator== (const BitRef &BR) const {
+    // If Reg is 0, disregard Pos.
+    return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
+  }
+  unsigned Reg;
+  uint16_t Pos;
+};
+
+
+// Abstraction of a register reference in MachineOperand. It contains the
+// register number and the subregister index.
+struct BitTracker::RegisterRef {
+  RegisterRef(unsigned R = 0, unsigned S = 0)
+    : Reg(R), Sub(S) {}
+  RegisterRef(const MachineOperand &MO)
+    : Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+  unsigned Reg, Sub;
+};
+
+
+// Value that a single bit can take. This is outside of the context of
+// any register, it is more of an abstraction of the two-element set of
+// possible bit values. One extension here is the "Ref" type, which
+// indicates that this bit takes the same value as the bit described by
+// RefInfo.
+struct BitTracker::BitValue {
+  enum ValueType {
+    Top,    // Bit not yet defined.
+    Zero,   // Bit = 0.
+    One,    // Bit = 1.
+    Ref     // Bit value same as the one described in RefI.
+    // Conceptually, there is no explicit "bottom" value: the lattice's
+    // bottom will be expressed as a "ref to itself", which, in the context
+    // of registers, could be read as "this value of this bit is defined by
+    // this bit".
+    // The ordering is:
+    //   x <= Top,
+    //   Self <= x, where "Self" is "ref to itself".
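+    // Under this ordering, meet(Zero, One) drops to Self (bottom), while
+    // meet(x, Top) leaves x unchanged; see the meet() member below.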
+ // This makes the value lattice different for each virtual register + // (even for each bit in the same virtual register), since the "bottom" + // for one register will be a simple "ref" for another register. + // Since we do not store the "Self" bit and register number, the meet + // operation will need to take it as a parameter. + // + // In practice there is a special case for values that are not associa- + // ted with any specific virtual register. An example would be a value + // corresponding to a bit of a physical register, or an intermediate + // value obtained in some computation (such as instruction evaluation). + // Such cases are identical to the usual Ref type, but the register + // number is 0. In such case the Pos field of the reference is ignored. + // + // What is worthy of notice is that in value V (that is a "ref"), as long + // as the RefI.Reg is not 0, it may actually be the same register as the + // one in which V will be contained. If the RefI.Pos refers to the posi- + // tion of V, then V is assumed to be "bottom" (as a "ref to itself"), + // otherwise V is taken to be identical to the referenced bit of the + // same register. + // If RefI.Reg is 0, however, such a reference to the same register is + // not possible. Any value V that is a "ref", and whose RefI.Reg is 0 + // is treated as "bottom". + }; + ValueType Type; + BitRef RefI; + + BitValue(ValueType T = Top) : Type(T) {} + BitValue(bool B) : Type(B ? One : Zero) {} + BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {} + BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {} + + bool operator== (const BitValue &V) const { + if (Type != V.Type) + return false; + if (Type == Ref && !(RefI == V.RefI)) + return false; + return true; + } + bool operator!= (const BitValue &V) const { + return !operator==(V); + } + bool is(unsigned T) const { + assert(T == 0 || T == 1); + return T == 0 ? Type == Zero + : (T == 1 ? Type == One : false); + } + + // The "meet" operation is the "." operation in a semilattice (L, ., T, B): + // (1) x.x = x + // (2) x.y = y.x + // (3) x.(y.z) = (x.y).z + // (4) x.T = x (i.e. T = "top") + // (5) x.B = B (i.e. B = "bottom") + // + // This "meet" function will update the value of the "*this" object with + // the newly calculated one, and return "true" if the value of *this has + // changed, and "false" otherwise. + // To prove that it satisfies the conditions (1)-(5), it is sufficient + // to show that a relation + // x <= y <=> x.y = x + // defines a partial order (i.e. that "meet" is same as "infimum"). + bool meet(const BitValue &V, const BitRef &Self) { + // First, check the cases where there is nothing to be done. + if (Type == Ref && RefI == Self) // Bottom.meet(V) = Bottom (i.e. This) + return false; + if (V.Type == Top) // This.meet(Top) = This + return false; + if (*this == V) // This.meet(This) = This + return false; + + // At this point, we know that the value of "this" will change. + // If it is Top, it will become the same as V, otherwise it will + // become "bottom" (i.e. Self). + if (Type == Top) { + Type = V.Type; + RefI = V.RefI; // This may be irrelevant, but copy anyway. + return true; + } + // Become "bottom". + Type = Ref; + RefI = Self; + return true; + } + + // Create a reference to the bit value V. + static BitValue ref(const BitValue &V); + // Create a "self". 
+ static BitValue self(const BitRef &Self = BitRef()); + + bool num() const { + return Type == Zero || Type == One; + } + operator bool() const { + assert(Type == Zero || Type == One); + return Type == One; + } + + friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV); +}; + + +// This operation must be idempotent, i.e. ref(ref(V)) == ref(V). +inline BitTracker::BitValue +BitTracker::BitValue::ref(const BitValue &V) { + if (V.Type != Ref) + return BitValue(V.Type); + if (V.RefI.Reg != 0) + return BitValue(V.RefI.Reg, V.RefI.Pos); + return self(); +} + + +inline BitTracker::BitValue +BitTracker::BitValue::self(const BitRef &Self) { + return BitValue(Self.Reg, Self.Pos); +} + + +// A sequence of bits starting from index B up to and including index E. +// If E < B, the mask represents two sections: [0..E] and [B..W) where +// W is the width of the register. +struct BitTracker::BitMask { + BitMask() : B(0), E(0) {} + BitMask(uint16_t b, uint16_t e) : B(b), E(e) {} + uint16_t first() const { return B; } + uint16_t last() const { return E; } +private: + uint16_t B, E; +}; + + +// Representation of a register: a list of BitValues. +struct BitTracker::RegisterCell { + RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {} + + uint16_t width() const { + return Bits.size(); + } + const BitValue &operator[](uint16_t BitN) const { + assert(BitN < Bits.size()); + return Bits[BitN]; + } + BitValue &operator[](uint16_t BitN) { + assert(BitN < Bits.size()); + return Bits[BitN]; + } + + bool meet(const RegisterCell &RC, unsigned SelfR); + RegisterCell &insert(const RegisterCell &RC, const BitMask &M); + RegisterCell extract(const BitMask &M) const; // Returns a new cell. + RegisterCell &rol(uint16_t Sh); // Rotate left. + RegisterCell &fill(uint16_t B, uint16_t E, const BitValue &V); + RegisterCell &cat(const RegisterCell &RC); // Concatenate. + uint16_t cl(bool B) const; + uint16_t ct(bool B) const; + + bool operator== (const RegisterCell &RC) const; + bool operator!= (const RegisterCell &RC) const { + return !operator==(RC); + } + + const RegisterCell &operator=(const RegisterCell &RC) { + Bits = RC.Bits; + return *this; + } + + // Generate a "ref" cell for the corresponding register. In the resulting + // cell each bit will be described as being the same as the corresponding + // bit in register Reg (i.e. the cell is "defined" by register Reg). + static RegisterCell self(unsigned Reg, uint16_t Width); + // Generate a "top" cell of given size. + static RegisterCell top(uint16_t Width); + // Generate a cell that is a "ref" to another cell. + static RegisterCell ref(const RegisterCell &C); + +private: + // The DefaultBitN is here only to avoid frequent reallocation of the + // memory in the vector. 
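+  // (Cells wider than DefaultBitN still work: the SmallVector simply
+  // spills to the heap beyond its inline capacity.)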
+ static const unsigned DefaultBitN = 32; + typedef SmallVector<BitValue, DefaultBitN> BitValueList; + BitValueList Bits; + + friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC); +}; + + +inline bool BitTracker::has(unsigned Reg) const { + return Map.find(Reg) != Map.end(); +} + + +inline const BitTracker::RegisterCell& +BitTracker::lookup(unsigned Reg) const { + CellMapType::const_iterator F = Map.find(Reg); + assert(F != Map.end()); + return F->second; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) { + RegisterCell RC(Width); + for (uint16_t i = 0; i < Width; ++i) + RC.Bits[i] = BitValue::self(BitRef(Reg, i)); + return RC; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::top(uint16_t Width) { + RegisterCell RC(Width); + for (uint16_t i = 0; i < Width; ++i) + RC.Bits[i] = BitValue(BitValue::Top); + return RC; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::ref(const RegisterCell &C) { + uint16_t W = C.width(); + RegisterCell RC(W); + for (unsigned i = 0; i < W; ++i) + RC[i] = BitValue::ref(C[i]); + return RC; +} + + +inline bool BitTracker::CellMapType::has(unsigned Reg) const { + return find(Reg) != end(); +} + +// A class to evaluate target's instructions and update the cell maps. +// This is used internally by the bit tracker. A target that wants to +// utilize this should implement the evaluation functions (noted below) +// in a subclass of this class. +struct BitTracker::MachineEvaluator { + MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M) + : TRI(T), MRI(M) {} + virtual ~MachineEvaluator() {} + + uint16_t getRegBitWidth(const RegisterRef &RR) const; + + RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const; + void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const; + // A result of any operation should use refs to the source cells, not + // the cells directly. This function is a convenience wrapper to quickly + // generate a ref for a cell corresponding to a register reference. + RegisterCell getRef(const RegisterRef &RR, const CellMapType &M) const { + RegisterCell RC = getCell(RR, M); + return RegisterCell::ref(RC); + } + + // Helper functions. + // Check if a cell is an immediate value (i.e. all bits are either 0 or 1). + bool isInt(const RegisterCell &A) const; + // Convert cell to an immediate value. + uint64_t toInt(const RegisterCell &A) const; + + // Generate cell from an immediate value. + RegisterCell eIMM(int64_t V, uint16_t W) const; + RegisterCell eIMM(const ConstantInt *CI) const; + + // Arithmetic. + RegisterCell eADD(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eSUB(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eMLS(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eMLU(const RegisterCell &A1, const RegisterCell &A2) const; + + // Shifts. + RegisterCell eASL(const RegisterCell &A1, uint16_t Sh) const; + RegisterCell eLSR(const RegisterCell &A1, uint16_t Sh) const; + RegisterCell eASR(const RegisterCell &A1, uint16_t Sh) const; + + // Logical. + RegisterCell eAND(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eORL(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eXOR(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eNOT(const RegisterCell &A1) const; + + // Set bit, clear bit. 
+ RegisterCell eSET(const RegisterCell &A1, uint16_t BitN) const; + RegisterCell eCLR(const RegisterCell &A1, uint16_t BitN) const; + + // Count leading/trailing bits (zeros/ones). + RegisterCell eCLB(const RegisterCell &A1, bool B, uint16_t W) const; + RegisterCell eCTB(const RegisterCell &A1, bool B, uint16_t W) const; + + // Sign/zero extension. + RegisterCell eSXT(const RegisterCell &A1, uint16_t FromN) const; + RegisterCell eZXT(const RegisterCell &A1, uint16_t FromN) const; + + // Extract/insert + // XTR R,b,e: extract bits from A1 starting at bit b, ending at e-1. + // INS R,S,b: take R and replace bits starting from b with S. + RegisterCell eXTR(const RegisterCell &A1, uint16_t B, uint16_t E) const; + RegisterCell eINS(const RegisterCell &A1, const RegisterCell &A2, + uint16_t AtN) const; + + // User-provided functions for individual targets: + + // Return a sub-register mask that indicates which bits in Reg belong + // to the subregister Sub. These bits are assumed to be contiguous in + // the super-register, and have the same ordering in the sub-register + // as in the super-register. It is valid to call this function with + // Sub == 0, in this case, the function should return a mask that spans + // the entire register Reg (which is what the default implementation + // does). + virtual BitMask mask(unsigned Reg, unsigned Sub) const; + // Indicate whether a given register class should be tracked. + virtual bool track(const TargetRegisterClass *RC) const { return true; } + // Evaluate a non-branching machine instruction, given the cell map with + // the input values. Place the results in the Outputs map. Return "true" + // if evaluation succeeded, "false" otherwise. + virtual bool evaluate(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + // Evaluate a branch, given the cell map with the input values. Fill out + // a list of all possible branch targets and indicate (through a flag) + // whether the branch could fall-through. Return "true" if this information + // has been successfully computed, "false" otherwise. + virtual bool evaluate(const MachineInstr *BI, const CellMapType &Inputs, + BranchTargetList &Targets, bool &FallsThru) const = 0; + + const TargetRegisterInfo &TRI; + MachineRegisterInfo &MRI; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index 758ccc741007..7ab2f0ba01df 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -12,13 +12,19 @@ tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(HexagonCommonTableGen) add_llvm_target(HexagonCodeGen + BitTracker.cpp HexagonAsmPrinter.cpp + HexagonBitTracker.cpp HexagonCFGOptimizer.cpp + HexagonCommonGEP.cpp HexagonCopyToCombine.cpp HexagonExpandCondsets.cpp HexagonExpandPredSpillCode.cpp HexagonFixupHwLoops.cpp HexagonFrameLowering.cpp + HexagonGenExtract.cpp + HexagonGenInsert.cpp + HexagonGenPredicate.cpp HexagonHardwareLoops.cpp HexagonInstrInfo.cpp HexagonISelDAGToDAG.cpp diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp new file mode 100644 index 000000000000..021e58a1d08a --- /dev/null +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -0,0 +1,1174 @@ +//===--- HexagonBitTracker.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include "Hexagon.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +using namespace llvm; + +typedef BitTracker BT; + +HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri, + MachineRegisterInfo &mri, + const HexagonInstrInfo &tii, + MachineFunction &mf) + : MachineEvaluator(tri, mri), MF(mf), MFI(*mf.getFrameInfo()), TII(tii) { + // Populate the VRX map (VR to extension-type). + // Go over all the formal parameters of the function. If a given parameter + // P is sign- or zero-extended, locate the virtual register holding that + // parameter and create an entry in the VRX map indicating the type of ex- + // tension (and the source type). + // This is a bit complicated to do accurately, since the memory layout in- + // formation is necessary to precisely determine whether an aggregate para- + // meter will be passed in a register or in memory. What is given in MRI + // is the association between the physical register that is live-in (i.e. + // holds an argument), and the virtual register that this value will be + // copied into. This, by itself, is not sufficient to map back the virtual + // register to a formal parameter from Function (since consecutive live-ins + // from MRI may not correspond to consecutive formal parameters from Func- + // tion). To avoid the complications with in-memory arguments, only consi- + // der the initial sequence of formal parameters that are known to be + // passed via registers. + unsigned AttrIdx = 0; + unsigned InVirtReg, InPhysReg = 0; + const Function &F = *MF.getFunction(); + typedef Function::const_arg_iterator arg_iterator; + for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { + AttrIdx++; + const Argument &Arg = *I; + Type *ATy = Arg.getType(); + unsigned Width = 0; + if (ATy->isIntegerTy()) + Width = ATy->getIntegerBitWidth(); + else if (ATy->isPointerTy()) + Width = 32; + // If pointer size is not set through target data, it will default to + // Module::AnyPointerSize. + if (Width == 0 || Width > 64) + break; + InPhysReg = getNextPhysReg(InPhysReg, Width); + if (!InPhysReg) + break; + InVirtReg = getVirtRegFor(InPhysReg); + if (!InVirtReg) + continue; + AttributeSet Attrs = F.getAttributes(); + if (Attrs.hasAttribute(AttrIdx, Attribute::SExt)) + VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width))); + else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt)) + VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width))); + } +} + + +BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { + if (Sub == 0) + return MachineEvaluator::mask(Reg, 0); + using namespace Hexagon; + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + unsigned ID = RC->getID(); + uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); + switch (ID) { + case DoubleRegsRegClassID: + return (Sub == subreg_loreg) ? 
BT::BitMask(0, RW-1) + : BT::BitMask(RW, 2*RW-1); + default: + break; + } +#ifndef NDEBUG + dbgs() << PrintReg(Reg, &TRI, Sub) << '\n'; +#endif + llvm_unreachable("Unexpected register/subregister"); +} + + +namespace { + struct RegisterRefs : public std::vector<BT::RegisterRef> { + typedef std::vector<BT::RegisterRef> Base; + RegisterRefs(const MachineInstr *MI); + const BT::RegisterRef &operator[](unsigned n) const { + // The main purpose of this operator is to assert with bad argument. + assert(n < size()); + return Base::operator[](n); + } + }; + + RegisterRefs::RegisterRefs(const MachineInstr *MI) + : Base(MI->getNumOperands()) { + for (unsigned i = 0, n = size(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) + at(i) = BT::RegisterRef(MO); + // For indices that don't correspond to registers, the entry will + // remain constructed via the default constructor. + } + } +} + + +bool HexagonEvaluator::evaluate(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + unsigned NumDefs = 0; + + // Sanity verification: there should not be any defs with subregisters. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + NumDefs++; + assert(MO.getSubReg() == 0); + } + + if (NumDefs == 0) + return false; + + if (MI->mayLoad()) + return evaluateLoad(MI, Inputs, Outputs); + + // Check COPY instructions that copy formal parameters into virtual + // registers. Such parameters can be sign- or zero-extended at the + // call site, and we should take advantage of this knowledge. The MRI + // keeps a list of pairs of live-in physical and virtual registers, + // which provides information about which virtual registers will hold + // the argument values. The function will still contain instructions + // defining those virtual registers, and in practice those are COPY + // instructions from a physical to a virtual register. In such cases, + // applying the argument extension to the virtual register can be seen + // as simply mirroring the extension that had already been applied to + // the physical register at the call site. If the defining instruction + // was not a COPY, it would not be clear how to mirror that extension + // on the callee's side. For that reason, only check COPY instructions + // for potential extensions. + if (MI->isCopy()) { + if (evaluateFormalCopy(MI, Inputs, Outputs)) + return true; + } + + // Beyond this point, if any operand is a global, skip that instruction. + // The reason is that certain instructions that can take an immediate + // operand can also have a global symbol in that operand. To avoid + // checking what kind of operand a given instruction has individually + // for each instruction, do it here. Global symbols as operands gene- + // rally do not provide any useful information. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() || + MO.isCPI()) + return false; + } + + RegisterRefs Reg(MI); + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + #define op(i) MI->getOperand(i) + #define rc(i) RegisterCell::ref(getCell(Reg[i],Inputs)) + #define im(i) MI->getOperand(i).getImm() + + // If the instruction has no register operands, skip it. + if (Reg.size() == 0) + return false; + + // Record result for register in operand 0. 
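+  // (Nearly every case in the switch below returns through this helper.)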
+  auto rr0 = [this,Reg] (const BT::RegisterCell &Val, CellMapType &Outputs)
+        -> bool {
+    putCell(Reg[0], Val, Outputs);
+    return true;
+  };
+  // Get the cell corresponding to the N-th operand.
+  auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W)
+        -> BT::RegisterCell {
+    const MachineOperand &Op = MI->getOperand(N);
+    if (Op.isImm())
+      return eIMM(Op.getImm(), W);
+    if (!Op.isReg())
+      return RegisterCell::self(0, W);
+    assert(getRegBitWidth(Reg[N]) == W && "Register width mismatch");
+    return rc(N);
+  };
+  // Extract RW low bits of the cell.
+  auto lo = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    assert(RW <= RC.width());
+    return eXTR(RC, 0, RW);
+  };
+  // Extract RW high bits of the cell.
+  auto hi = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    uint16_t W = RC.width();
+    assert(RW <= W);
+    return eXTR(RC, W-RW, W);
+  };
+  // Extract N-th halfword (counting from the least significant position).
+  auto half = [this] (const BT::RegisterCell &RC, unsigned N)
+        -> BT::RegisterCell {
+    assert(N*16+16 <= RC.width());
+    return eXTR(RC, N*16, N*16+16);
+  };
+  // Shuffle bits (pick even/odd from cells and merge into result).
+  auto shuffle = [this] (const BT::RegisterCell &Rs, const BT::RegisterCell &Rt,
+                         uint16_t BW, bool Odd) -> BT::RegisterCell {
+    uint16_t I = Odd, Ws = Rs.width();
+    assert(Ws == Rt.width());
+    RegisterCell RC = eXTR(Rt, I*BW, I*BW+BW).cat(eXTR(Rs, I*BW, I*BW+BW));
+    I += 2;
+    while (I*BW < Ws) {
+      RC.cat(eXTR(Rt, I*BW, I*BW+BW)).cat(eXTR(Rs, I*BW, I*BW+BW));
+      I += 2;
+    }
+    return RC;
+  };
+
+  // The bitwidth of the 0th operand. In most (if not all) of the
+  // instructions below, the 0th operand is the defined register.
+  // Pre-compute the bitwidth here, because it is needed in many
+  // cases below.
+  uint16_t W0 = (Reg[0].Reg != 0) ?
getRegBitWidth(Reg[0]) : 0; + + switch (Opc) { + // Transfer immediate: + + case A2_tfrsi: + case A2_tfrpi: + case CONST32: + case CONST32_Float_Real: + case CONST32_Int_Real: + case CONST64_Float_Real: + case CONST64_Int_Real: + return rr0(eIMM(im(1), W0), Outputs); + case TFR_PdFalse: + return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::Zero), Outputs); + case TFR_PdTrue: + return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::One), Outputs); + case TFR_FI: { + int FI = op(1).getIndex(); + int Off = op(2).getImm(); + unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off); + unsigned L = Log2_32(A); + RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); + RC.fill(0, L, BT::BitValue::Zero); + return rr0(RC, Outputs); + } + + // Transfer register: + + case A2_tfr: + case A2_tfrp: + case C2_pxfer_map: + return rr0(rc(1), Outputs); + case C2_tfrpr: { + uint16_t RW = W0; + uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + assert(PW <= RW); + RegisterCell PC = eXTR(rc(1), 0, PW); + RegisterCell RC = RegisterCell(RW).insert(PC, BT::BitMask(0, PW-1)); + RC.fill(PW, RW, BT::BitValue::Zero); + return rr0(RC, Outputs); + } + case C2_tfrrp: { + RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); + W0 = 8; // XXX Pred size + return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs); + } + + // Arithmetic: + + case A2_abs: + case A2_absp: + // TODO + break; + + case A2_addsp: { + uint16_t W1 = getRegBitWidth(Reg[1]); + assert(W0 == 64 && W1 == 32); + RegisterCell CW = RegisterCell(W0).insert(rc(1), BT::BitMask(0, W1-1)); + RegisterCell RC = eADD(eSXT(CW, W1), rc(2)); + return rr0(RC, Outputs); + } + case A2_add: + case A2_addp: + return rr0(eADD(rc(1), rc(2)), Outputs); + case A2_addi: + return rr0(eADD(rc(1), eIMM(im(2), W0)), Outputs); + case S4_addi_asl_ri: { + RegisterCell RC = eADD(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_addi_lsr_ri: { + RegisterCell RC = eADD(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_addaddi: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M4_mpyri_addi: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyrr_addi: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyri_addr_u2: { + RegisterCell M = eMLS(eIMM(im(2), W0), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyri_addr: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyrr_addr: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case S4_subaddi: { + RegisterCell RC = eADD(rc(1), eSUB(eIMM(im(2), W0), rc(3))); + return rr0(RC, Outputs); + } + case M2_accii: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M2_acci: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), rc(3))); + return rr0(RC, Outputs); + } + case M2_subacc: { + RegisterCell RC = eADD(rc(1), eSUB(rc(2), rc(3))); + return rr0(RC, Outputs); + } + case S2_addasl_rrri: { + RegisterCell RC = eADD(rc(1), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case C4_addipc: { + RegisterCell RPC = RegisterCell::self(Reg[0].Reg, W0); + RPC.fill(0, 2, BT::BitValue::Zero); + return rr0(eADD(RPC, 
eIMM(im(2), W0)), Outputs); + } + case A2_sub: + case A2_subp: + return rr0(eSUB(rc(1), rc(2)), Outputs); + case A2_subri: + return rr0(eSUB(eIMM(im(1), W0), rc(2)), Outputs); + case S4_subi_asl_ri: { + RegisterCell RC = eSUB(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_subi_lsr_ri: { + RegisterCell RC = eSUB(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M2_naccii: { + RegisterCell RC = eSUB(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M2_nacci: { + RegisterCell RC = eSUB(rc(1), eADD(rc(2), rc(3))); + return rr0(RC, Outputs); + } + // 32-bit negation is done by "Rd = A2_subri 0, Rs" + case A2_negp: + return rr0(eSUB(eIMM(0, W0), rc(1)), Outputs); + + case M2_mpy_up: { + RegisterCell M = eMLS(rc(1), rc(2)); + return rr0(hi(M, W0), Outputs); + } + case M2_dpmpyss_s0: + return rr0(eMLS(rc(1), rc(2)), Outputs); + case M2_dpmpyss_acc_s0: + return rr0(eADD(rc(1), eMLS(rc(2), rc(3))), Outputs); + case M2_dpmpyss_nac_s0: + return rr0(eSUB(rc(1), eMLS(rc(2), rc(3))), Outputs); + case M2_mpyi: { + RegisterCell M = eMLS(rc(1), rc(2)); + return rr0(lo(M, W0), Outputs); + } + case M2_macsip: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_macsin: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eSUB(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_maci: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_mpysmi: { + RegisterCell M = eMLS(rc(1), eIMM(im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpysin: { + RegisterCell M = eMLS(rc(1), eIMM(-im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpysip: { + RegisterCell M = eMLS(rc(1), eIMM(im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpyu_up: { + RegisterCell M = eMLU(rc(1), rc(2)); + return rr0(hi(M, W0), Outputs); + } + case M2_dpmpyuu_s0: + return rr0(eMLU(rc(1), rc(2)), Outputs); + case M2_dpmpyuu_acc_s0: + return rr0(eADD(rc(1), eMLU(rc(2), rc(3))), Outputs); + case M2_dpmpyuu_nac_s0: + return rr0(eSUB(rc(1), eMLU(rc(2), rc(3))), Outputs); + //case M2_mpysu_up: + + // Logical/bitwise: + + case A2_andir: + return rr0(eAND(rc(1), eIMM(im(2), W0)), Outputs); + case A2_and: + case A2_andp: + return rr0(eAND(rc(1), rc(2)), Outputs); + case A4_andn: + case A4_andnp: + return rr0(eAND(rc(1), eNOT(rc(2))), Outputs); + case S4_andi_asl_ri: { + RegisterCell RC = eAND(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_andi_lsr_ri: { + RegisterCell RC = eAND(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M4_and_and: + return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_and_andn: + return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case M4_and_or: + return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_and_xor: + return rr0(eAND(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_orir: + return rr0(eORL(rc(1), eIMM(im(2), W0)), Outputs); + case A2_or: + case A2_orp: + return rr0(eORL(rc(1), rc(2)), Outputs); + case A4_orn: + case A4_ornp: + return rr0(eORL(rc(1), eNOT(rc(2))), Outputs); + case S4_ori_asl_ri: { + RegisterCell RC = eORL(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_ori_lsr_ri: { + RegisterCell RC = eORL(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M4_or_and: + return 
rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_or_andn: + return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case S4_or_andi: + case S4_or_andix: { + RegisterCell RC = eORL(rc(1), eAND(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case S4_or_ori: { + RegisterCell RC = eORL(rc(1), eORL(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M4_or_or: + return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_or_xor: + return rr0(eORL(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_xor: + case A2_xorp: + return rr0(eXOR(rc(1), rc(2)), Outputs); + case M4_xor_and: + return rr0(eXOR(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_xor_andn: + return rr0(eXOR(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case M4_xor_or: + return rr0(eXOR(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_xor_xacc: + return rr0(eXOR(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_not: + case A2_notp: + return rr0(eNOT(rc(1)), Outputs); + + case S2_asl_i_r: + case S2_asl_i_p: + return rr0(eASL(rc(1), im(2)), Outputs); + case A2_aslh: + return rr0(eASL(rc(1), 16), Outputs); + case S2_asl_i_r_acc: + case S2_asl_i_p_acc: + return rr0(eADD(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_nac: + case S2_asl_i_p_nac: + return rr0(eSUB(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_and: + case S2_asl_i_p_and: + return rr0(eAND(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_or: + case S2_asl_i_p_or: + return rr0(eORL(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_xacc: + case S2_asl_i_p_xacc: + return rr0(eXOR(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_vh: + case S2_asl_i_vw: + // TODO + break; + + case S2_asr_i_r: + case S2_asr_i_p: + return rr0(eASR(rc(1), im(2)), Outputs); + case A2_asrh: + return rr0(eASR(rc(1), 16), Outputs); + case S2_asr_i_r_acc: + case S2_asr_i_p_acc: + return rr0(eADD(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_nac: + case S2_asr_i_p_nac: + return rr0(eSUB(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_and: + case S2_asr_i_p_and: + return rr0(eAND(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_or: + case S2_asr_i_p_or: + return rr0(eORL(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_rnd: { + // The input is first sign-extended to 64 bits, then the output + // is truncated back to 32 bits. 
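+      // A worked example of the expression below (not in the original
+      // comment): it computes ((Rs asr #u5) + 1) asr 1 on the sign-extended
+      // input, e.g. for Rs = 6, #u5 = 1: (3 + 1) asr 1 = 2, i.e. 6/4
+      // rounded to nearest.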
+ assert(W0 == 32); + RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0); + RegisterCell RC = eASR(eADD(eASR(XC, im(2)), eIMM(1, 2*W0)), 1); + return rr0(eXTR(RC, 0, W0), Outputs); + } + case S2_asr_i_r_rnd_goodsyntax: { + int64_t S = im(2); + if (S == 0) + return rr0(rc(1), Outputs); + // Result: S2_asr_i_r_rnd Rs, u5-1 + RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0); + RegisterCell RC = eLSR(eADD(eASR(XC, S-1), eIMM(1, 2*W0)), 1); + return rr0(eXTR(RC, 0, W0), Outputs); + } + case S2_asr_r_vh: + case S2_asr_i_vw: + case S2_asr_i_svw_trun: + // TODO + break; + + case S2_lsr_i_r: + case S2_lsr_i_p: + return rr0(eLSR(rc(1), im(2)), Outputs); + case S2_lsr_i_r_acc: + case S2_lsr_i_p_acc: + return rr0(eADD(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_nac: + case S2_lsr_i_p_nac: + return rr0(eSUB(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_and: + case S2_lsr_i_p_and: + return rr0(eAND(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_or: + case S2_lsr_i_p_or: + return rr0(eORL(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_xacc: + case S2_lsr_i_p_xacc: + return rr0(eXOR(rc(1), eLSR(rc(2), im(3))), Outputs); + + case S2_clrbit_i: { + RegisterCell RC = rc(1); + RC[im(2)] = BT::BitValue::Zero; + return rr0(RC, Outputs); + } + case S2_setbit_i: { + RegisterCell RC = rc(1); + RC[im(2)] = BT::BitValue::One; + return rr0(RC, Outputs); + } + case S2_togglebit_i: { + RegisterCell RC = rc(1); + uint16_t BX = im(2); + RC[BX] = RC[BX].is(0) ? BT::BitValue::One + : RC[BX].is(1) ? BT::BitValue::Zero + : BT::BitValue::self(); + return rr0(RC, Outputs); + } + + case A4_bitspliti: { + uint16_t W1 = getRegBitWidth(Reg[1]); + uint16_t BX = im(2); + // Res.uw[1] = Rs[bx+1:], Res.uw[0] = Rs[0:bx] + const BT::BitValue Zero = BT::BitValue::Zero; + RegisterCell RZ = RegisterCell(W0).fill(BX, W1, Zero) + .fill(W1+(W1-BX), W0, Zero); + RegisterCell BF1 = eXTR(rc(1), 0, BX), BF2 = eXTR(rc(1), BX, W1); + RegisterCell RC = eINS(eINS(RZ, BF1, 0), BF2, W1); + return rr0(RC, Outputs); + } + case S4_extract: + case S4_extractp: + case S2_extractu: + case S2_extractup: { + uint16_t Wd = im(2), Of = im(3); + assert(Wd <= W0); + if (Wd == 0) + return rr0(eIMM(0, W0), Outputs); + // If the width extends beyond the register size, pad the register + // with 0 bits. + RegisterCell Pad = (Wd+Of > W0) ? rc(1).cat(eIMM(0, Wd+Of-W0)) : rc(1); + RegisterCell Ext = eXTR(Pad, Of, Wd+Of); + // Ext is short, need to extend it with 0s or sign bit. + RegisterCell RC = RegisterCell(W0).insert(Ext, BT::BitMask(0, Wd-1)); + if (Opc == S2_extractu || Opc == S2_extractup) + return rr0(eZXT(RC, Wd), Outputs); + return rr0(eSXT(RC, Wd), Outputs); + } + case S2_insert: + case S2_insertp: { + uint16_t Wd = im(3), Of = im(4); + assert(Wd < W0 && Of < W0); + // If Wd+Of exceeds W0, the inserted bits are truncated. 
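+      // For example, with W0 = 32, Wd = 8 and Of = 28, only the low
+      // 32-28 = 4 bits of the source are inserted.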
+ if (Wd+Of > W0) + Wd = W0-Of; + if (Wd == 0) + return rr0(rc(1), Outputs); + return rr0(eINS(rc(1), eXTR(rc(2), 0, Wd), Of), Outputs); + } + + // Bit permutations: + + case A2_combineii: + case A4_combineii: + case A4_combineir: + case A4_combineri: + case A2_combinew: + assert(W0 % 2 == 0); + return rr0(cop(2, W0/2).cat(cop(1, W0/2)), Outputs); + case A2_combine_ll: + case A2_combine_lh: + case A2_combine_hl: + case A2_combine_hh: { + assert(W0 == 32); + assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32); + // Low half in the output is 0 for _ll and _hl, 1 otherwise: + unsigned LoH = !(Opc == A2_combine_ll || Opc == A2_combine_hl); + // High half in the output is 0 for _ll and _lh, 1 otherwise: + unsigned HiH = !(Opc == A2_combine_ll || Opc == A2_combine_lh); + RegisterCell R1 = rc(1); + RegisterCell R2 = rc(2); + RegisterCell RC = half(R2, LoH).cat(half(R1, HiH)); + return rr0(RC, Outputs); + } + case S2_packhl: { + assert(W0 == 64); + assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32); + RegisterCell R1 = rc(1); + RegisterCell R2 = rc(2); + RegisterCell RC = half(R2, 0).cat(half(R1, 0)).cat(half(R2, 1)) + .cat(half(R1, 1)); + return rr0(RC, Outputs); + } + case S2_shuffeb: { + RegisterCell RC = shuffle(rc(1), rc(2), 8, false); + return rr0(RC, Outputs); + } + case S2_shuffeh: { + RegisterCell RC = shuffle(rc(1), rc(2), 16, false); + return rr0(RC, Outputs); + } + case S2_shuffob: { + RegisterCell RC = shuffle(rc(1), rc(2), 8, true); + return rr0(RC, Outputs); + } + case S2_shuffoh: { + RegisterCell RC = shuffle(rc(1), rc(2), 16, true); + return rr0(RC, Outputs); + } + case C2_mask: { + uint16_t WR = W0; + uint16_t WP = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + assert(WR == 64 && WP == 8); + RegisterCell R1 = rc(1); + RegisterCell RC(WR); + for (uint16_t i = 0; i < WP; ++i) { + const BT::BitValue &V = R1[i]; + BT::BitValue F = (V.is(0) || V.is(1)) ? V : BT::BitValue::self(); + RC.fill(i*8, i*8+8, F); + } + return rr0(RC, Outputs); + } + + // Mux: + + case C2_muxii: + case C2_muxir: + case C2_muxri: + case C2_mux: { + BT::BitValue PC0 = rc(1)[0]; + RegisterCell R2 = cop(2, W0); + RegisterCell R3 = cop(3, W0); + if (PC0.is(0) || PC0.is(1)) + return rr0(RegisterCell::ref(PC0 ? R2 : R3), Outputs); + R2.meet(R3, Reg[0].Reg); + return rr0(R2, Outputs); + } + case C2_vmux: + // TODO + break; + + // Sign- and zero-extension: + + case A2_sxtb: + return rr0(eSXT(rc(1), 8), Outputs); + case A2_sxth: + return rr0(eSXT(rc(1), 16), Outputs); + case A2_sxtw: { + uint16_t W1 = getRegBitWidth(Reg[1]); + assert(W0 == 64 && W1 == 32); + RegisterCell RC = eSXT(rc(1).cat(eIMM(0, W1)), W1); + return rr0(RC, Outputs); + } + case A2_zxtb: + return rr0(eZXT(rc(1), 8), Outputs); + case A2_zxth: + return rr0(eZXT(rc(1), 16), Outputs); + + // Bit count: + + case S2_cl0: + case S2_cl0p: + // Always produce a 32-bit result. 
+ return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs); + case S2_cl1: + case S2_cl1p: + return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs); + case S2_clb: + case S2_clbp: { + uint16_t W1 = getRegBitWidth(Reg[1]); + RegisterCell R1 = rc(1); + BT::BitValue TV = R1[W1-1]; + if (TV.is(0) || TV.is(1)) + return rr0(eCLB(R1, TV, 32), Outputs); + break; + } + case S2_ct0: + case S2_ct0p: + return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs); + case S2_ct1: + case S2_ct1p: + return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs); + case S5_popcountp: + // TODO + break; + + case C2_all8: { + RegisterCell P1 = rc(1); + bool Has0 = false, All1 = true; + for (uint16_t i = 0; i < 8/*XXX*/; ++i) { + if (!P1[i].is(1)) + All1 = false; + if (!P1[i].is(0)) + continue; + Has0 = true; + break; + } + if (!Has0 && !All1) + break; + RegisterCell RC(W0); + RC.fill(0, W0, (All1 ? BT::BitValue::One : BT::BitValue::Zero)); + return rr0(RC, Outputs); + } + case C2_any8: { + RegisterCell P1 = rc(1); + bool Has1 = false, All0 = true; + for (uint16_t i = 0; i < 8/*XXX*/; ++i) { + if (!P1[i].is(0)) + All0 = false; + if (!P1[i].is(1)) + continue; + Has1 = true; + break; + } + if (!Has1 && !All0) + break; + RegisterCell RC(W0); + RC.fill(0, W0, (Has1 ? BT::BitValue::One : BT::BitValue::Zero)); + return rr0(RC, Outputs); + } + case C2_and: + return rr0(eAND(rc(1), rc(2)), Outputs); + case C2_andn: + return rr0(eAND(rc(1), eNOT(rc(2))), Outputs); + case C2_not: + return rr0(eNOT(rc(1)), Outputs); + case C2_or: + return rr0(eORL(rc(1), rc(2)), Outputs); + case C2_orn: + return rr0(eORL(rc(1), eNOT(rc(2))), Outputs); + case C2_xor: + return rr0(eXOR(rc(1), rc(2)), Outputs); + case C4_and_and: + return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs); + case C4_and_andn: + return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case C4_and_or: + return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs); + case C4_and_orn: + return rr0(eAND(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs); + case C4_or_and: + return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs); + case C4_or_andn: + return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case C4_or_or: + return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs); + case C4_or_orn: + return rr0(eORL(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs); + case C2_bitsclr: + case C2_bitsclri: + case C2_bitsset: + case C4_nbitsclr: + case C4_nbitsclri: + case C4_nbitsset: + // TODO + break; + case S2_tstbit_i: + case S4_ntstbit_i: { + BT::BitValue V = rc(1)[im(2)]; + if (V.is(0) || V.is(1)) { + // If instruction is S2_tstbit_i, test for 1, otherwise test for 0. + bool TV = (Opc == S2_tstbit_i); + BT::BitValue F = V.is(TV) ? BT::BitValue::One : BT::BitValue::Zero; + return rr0(RegisterCell(W0).fill(0, W0, F), Outputs); + } + break; + } + + default: + return MachineEvaluator::evaluate(MI, Inputs, Outputs); + } + #undef im + #undef rc + #undef op + return false; +} + + +bool HexagonEvaluator::evaluate(const MachineInstr *BI, + const CellMapType &Inputs, BranchTargetList &Targets, + bool &FallsThru) const { + // We need to evaluate one branch at a time. TII::AnalyzeBranch checks + // all the branches in a basic block at once, so we cannot use it. + unsigned Opc = BI->getOpcode(); + bool SimpleBranch = false; + bool Negated = false; + switch (Opc) { + case Hexagon::J2_jumpf: + case Hexagon::J2_jumpfnew: + case Hexagon::J2_jumpfnewpt: + Negated = true; + case Hexagon::J2_jumpt: + case Hexagon::J2_jumptnew: + case Hexagon::J2_jumptnewpt: + // Simple branch: if([!]Pn) jump ... + // i.e. Op0 = predicate, Op1 = branch target. 
+ SimpleBranch = true; + break; + case Hexagon::J2_jump: + Targets.insert(BI->getOperand(0).getMBB()); + FallsThru = false; + return true; + default: + // If the branch is of unknown type, assume that all successors are + // executable. + return false; + } + + if (!SimpleBranch) + return false; + + // BI is a conditional branch if we got here. + RegisterRef PR = BI->getOperand(0); + RegisterCell PC = getCell(PR, Inputs); + const BT::BitValue &Test = PC[0]; + + // If the condition is neither true nor false, then it's unknown. + if (!Test.is(0) && !Test.is(1)) + return false; + + // "Test.is(!Negated)" means "branch condition is true". + if (!Test.is(!Negated)) { + // Condition known to be false. + FallsThru = true; + return true; + } + + Targets.insert(BI->getOperand(1).getMBB()); + FallsThru = false; + return true; +} + + +bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + if (TII.isPredicated(MI)) + return false; + assert(MI->mayLoad() && "A load that mayn't?"); + unsigned Opc = MI->getOpcode(); + + uint16_t BitNum; + bool SignEx; + using namespace Hexagon; + + switch (Opc) { + default: + return false; + +#if 0 + // memb_fifo + case L2_loadalignb_pbr: + case L2_loadalignb_pcr: + case L2_loadalignb_pi: + // memh_fifo + case L2_loadalignh_pbr: + case L2_loadalignh_pcr: + case L2_loadalignh_pi: + // membh + case L2_loadbsw2_pbr: + case L2_loadbsw2_pci: + case L2_loadbsw2_pcr: + case L2_loadbsw2_pi: + case L2_loadbsw4_pbr: + case L2_loadbsw4_pci: + case L2_loadbsw4_pcr: + case L2_loadbsw4_pi: + // memubh + case L2_loadbzw2_pbr: + case L2_loadbzw2_pci: + case L2_loadbzw2_pcr: + case L2_loadbzw2_pi: + case L2_loadbzw4_pbr: + case L2_loadbzw4_pci: + case L2_loadbzw4_pcr: + case L2_loadbzw4_pi: +#endif + + case L2_loadrbgp: + case L2_loadrb_io: + case L2_loadrb_pbr: + case L2_loadrb_pci: + case L2_loadrb_pcr: + case L2_loadrb_pi: + case L4_loadrb_abs: + case L4_loadrb_ap: + case L4_loadrb_rr: + case L4_loadrb_ur: + BitNum = 8; + SignEx = true; + break; + + case L2_loadrubgp: + case L2_loadrub_io: + case L2_loadrub_pbr: + case L2_loadrub_pci: + case L2_loadrub_pcr: + case L2_loadrub_pi: + case L4_loadrub_abs: + case L4_loadrub_ap: + case L4_loadrub_rr: + case L4_loadrub_ur: + BitNum = 8; + SignEx = false; + break; + + case L2_loadrhgp: + case L2_loadrh_io: + case L2_loadrh_pbr: + case L2_loadrh_pci: + case L2_loadrh_pcr: + case L2_loadrh_pi: + case L4_loadrh_abs: + case L4_loadrh_ap: + case L4_loadrh_rr: + case L4_loadrh_ur: + BitNum = 16; + SignEx = true; + break; + + case L2_loadruhgp: + case L2_loadruh_io: + case L2_loadruh_pbr: + case L2_loadruh_pci: + case L2_loadruh_pcr: + case L2_loadruh_pi: + case L4_loadruh_rr: + case L4_loadruh_abs: + case L4_loadruh_ap: + case L4_loadruh_ur: + BitNum = 16; + SignEx = false; + break; + + case L2_loadrigp: + case L2_loadri_io: + case L2_loadri_pbr: + case L2_loadri_pci: + case L2_loadri_pcr: + case L2_loadri_pi: + case L2_loadw_locked: + case L4_loadri_abs: + case L4_loadri_ap: + case L4_loadri_rr: + case L4_loadri_ur: + case LDriw_pred: + BitNum = 32; + SignEx = true; + break; + + case L2_loadrdgp: + case L2_loadrd_io: + case L2_loadrd_pbr: + case L2_loadrd_pci: + case L2_loadrd_pcr: + case L2_loadrd_pi: + case L4_loadd_locked: + case L4_loadrd_abs: + case L4_loadrd_ap: + case L4_loadrd_rr: + case L4_loadrd_ur: + BitNum = 64; + SignEx = true; + break; + } + + const MachineOperand &MD = MI->getOperand(0); + assert(MD.isReg() && MD.isDef()); + RegisterRef RD = MD; + + uint16_t W = 
getRegBitWidth(RD); + assert(W >= BitNum && BitNum > 0); + RegisterCell Res(W); + + for (uint16_t i = 0; i < BitNum; ++i) + Res[i] = BT::BitValue::self(BT::BitRef(RD.Reg, i)); + + if (SignEx) { + const BT::BitValue &Sign = Res[BitNum-1]; + for (uint16_t i = BitNum; i < W; ++i) + Res[i] = BT::BitValue::ref(Sign); + } else { + for (uint16_t i = BitNum; i < W; ++i) + Res[i] = BT::BitValue::Zero; + } + + putCell(RD, Res, Outputs); + return true; +} + + +bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + // If MI defines a formal parameter, but is not a copy (loads are handled + // in evaluateLoad), then it's not clear what to do. + assert(MI->isCopy()); + + RegisterRef RD = MI->getOperand(0); + RegisterRef RS = MI->getOperand(1); + assert(RD.Sub == 0); + if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg)) + return false; + RegExtMap::const_iterator F = VRX.find(RD.Reg); + if (F == VRX.end()) + return false; + + uint16_t EW = F->second.Width; + // Store RD's cell into the map. This will associate the cell with a virtual + // register, and make zero-/sign-extends possible (otherwise we would be ex- + // tending "self" bit values, which will have no effect, since "self" values + // cannot be references to anything). + putCell(RD, getCell(RS, Inputs), Outputs); + + RegisterCell Res; + // Read RD's cell from the outputs instead of RS's cell from the inputs: + if (F->second.Type == ExtType::SExt) + Res = eSXT(getCell(RD, Outputs), EW); + else if (F->second.Type == ExtType::ZExt) + Res = eZXT(getCell(RD, Outputs), EW); + + putCell(RD, Res, Outputs); + return true; +} + + +unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const { + using namespace Hexagon; + bool Is64 = DoubleRegsRegClass.contains(PReg); + assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg)); + + static const unsigned Phys32[] = { R0, R1, R2, R3, R4, R5 }; + static const unsigned Phys64[] = { D0, D1, D2 }; + const unsigned Num32 = sizeof(Phys32)/sizeof(unsigned); + const unsigned Num64 = sizeof(Phys64)/sizeof(unsigned); + + // Return the first parameter register of the required width. + if (PReg == 0) + return (Width <= 32) ? Phys32[0] : Phys64[0]; + + // Set Idx32, Idx64 in such a way that Idx+1 would give the index of the + // next register. + unsigned Idx32 = 0, Idx64 = 0; + if (!Is64) { + while (Idx32 < Num32) { + if (Phys32[Idx32] == PReg) + break; + Idx32++; + } + Idx64 = Idx32/2; + } else { + while (Idx64 < Num64) { + if (Phys64[Idx64] == PReg) + break; + Idx64++; + } + Idx32 = Idx64*2+1; + } + + if (Width <= 32) + return (Idx32+1 < Num32) ? Phys32[Idx32+1] : 0; + return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0; +} + + +unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const { + typedef MachineRegisterInfo::livein_iterator iterator; + for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + if (I->first == PReg) + return I->second; + } + return 0; +} diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h new file mode 100644 index 000000000000..897af2d71870 --- /dev/null +++ b/lib/Target/Hexagon/HexagonBitTracker.h @@ -0,0 +1,64 @@ +//===--- HexagonBitTracker.h ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONBITTRACKER_H +#define HEXAGONBITTRACKER_H + +#include "BitTracker.h" +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + class HexagonInstrInfo; + class HexagonRegisterInfo; + +struct HexagonEvaluator : public BitTracker::MachineEvaluator { + typedef BitTracker::CellMapType CellMapType; + typedef BitTracker::RegisterRef RegisterRef; + typedef BitTracker::RegisterCell RegisterCell; + typedef BitTracker::BranchTargetList BranchTargetList; + + HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri, + const HexagonInstrInfo &tii, MachineFunction &mf); + + bool evaluate(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const override; + bool evaluate(const MachineInstr *BI, const CellMapType &Inputs, + BranchTargetList &Targets, bool &FallsThru) const override; + + BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override; + + MachineFunction &MF; + MachineFrameInfo &MFI; + const HexagonInstrInfo &TII; + +private: + bool evaluateLoad(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + bool evaluateFormalCopy(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + + unsigned getNextPhysReg(unsigned PReg, unsigned Width) const; + unsigned getVirtRegFor(unsigned PReg) const; + + // Type of formal parameter extension. + struct ExtType { + enum { SExt, ZExt }; + char Type; + uint16_t Width; + ExtType() : Type(0), Width(0) {} + ExtType(char t, uint16_t w) : Type(t), Width(w) {} + }; + // Map VR -> extension type. + typedef DenseMap<unsigned, ExtType> RegExtMap; + RegExtMap VRX; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp new file mode 100644 index 000000000000..9f5fac156527 --- /dev/null +++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -0,0 +1,1325 @@ +//===--- HexagonCommonGEP.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "commgep" + +#include "llvm/Pass.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +#include <map> +#include <set> +#include <vector> + +#include "HexagonTargetMachine.h" + +using namespace llvm; + +static cl::opt<bool> OptSpeculate("commgep-speculate", cl::init(true), + cl::Hidden, cl::ZeroOrMore); + +static cl::opt<bool> OptEnableInv("commgep-inv", cl::init(true), cl::Hidden, + cl::ZeroOrMore); + +static cl::opt<bool> OptEnableConst("commgep-const", cl::init(true), + cl::Hidden, cl::ZeroOrMore); + +namespace llvm { + void initializeHexagonCommonGEPPass(PassRegistry&); +} + +namespace { + struct GepNode; + typedef std::set<GepNode*> NodeSet; + typedef std::map<GepNode*,Value*> NodeToValueMap; + typedef std::vector<GepNode*> NodeVect; + typedef std::map<GepNode*,NodeVect> NodeChildrenMap; + typedef std::set<Use*> UseSet; + typedef std::map<GepNode*,UseSet> NodeToUsesMap; + + // Numbering map for gep nodes. Used to keep track of ordering for + // gep nodes. + struct NodeNumbering : public std::map<const GepNode*,unsigned> { + }; + + struct NodeOrdering : public NodeNumbering { + NodeOrdering() : LastNum(0) {} +#ifdef _MSC_VER + void special_insert_for_special_msvc(const GepNode *N) +#else + using NodeNumbering::insert; + void insert(const GepNode* N) +#endif + { + insert(std::make_pair(N, ++LastNum)); + } + bool operator() (const GepNode* N1, const GepNode *N2) const { + const_iterator F1 = find(N1), F2 = find(N2); + assert(F1 != end() && F2 != end()); + return F1->second < F2->second; + } + private: + unsigned LastNum; + }; + + + class HexagonCommonGEP : public FunctionPass { + public: + static char ID; + HexagonCommonGEP() : FunctionPass(ID) { + initializeHexagonCommonGEPPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "Hexagon Common GEP"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); + AU.addPreserved<PostDominatorTree>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + private: + typedef std::map<Value*,GepNode*> ValueToNodeMap; + typedef std::vector<Value*> ValueVect; + typedef std::map<GepNode*,ValueVect> NodeToValuesMap; + + void getBlockTraversalOrder(BasicBlock *Root, ValueVect &Order); + bool isHandledGepForm(GetElementPtrInst *GepI); + void processGepInst(GetElementPtrInst *GepI, ValueToNodeMap &NM); + void collect(); + void common(); + + BasicBlock *recalculatePlacement(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + BasicBlock *recalculatePlacementRec(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + bool isInvariantIn(Value *Val, Loop *L); + bool isInvariantIn(GepNode *Node, Loop *L); + bool 
isInMainPath(BasicBlock *B, Loop *L); + BasicBlock *adjustForInvariance(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + void separateChainForNode(GepNode *Node, Use *U, NodeToValueMap &Loc); + void separateConstantChains(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + void computeNodePlacement(NodeToValueMap &Loc); + + Value *fabricateGEP(NodeVect &NA, BasicBlock::iterator At, + BasicBlock *LocB); + void getAllUsersForNode(GepNode *Node, ValueVect &Values, + NodeChildrenMap &NCM); + void materialize(NodeToValueMap &Loc); + + void removeDeadCode(); + + NodeVect Nodes; + NodeToUsesMap Uses; + NodeOrdering NodeOrder; // Node ordering, for deterministic behavior. + SpecificBumpPtrAllocator<GepNode> *Mem; + LLVMContext *Ctx; + LoopInfo *LI; + DominatorTree *DT; + PostDominatorTree *PDT; + Function *Fn; + }; +} + + +char HexagonCommonGEP::ID = 0; +INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP", + false, false) + +namespace { + struct GepNode { + enum { + None = 0, + Root = 0x01, + Internal = 0x02, + Used = 0x04 + }; + + uint32_t Flags; + union { + GepNode *Parent; + Value *BaseVal; + }; + Value *Idx; + Type *PTy; // Type of the pointer operand. + + GepNode() : Flags(0), Parent(0), Idx(0), PTy(0) {} + GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) { + if (Flags & Root) + BaseVal = N->BaseVal; + else + Parent = N->Parent; + } + friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN); + }; + + + Type *next_type(Type *Ty, Value *Idx) { + // Advance the type. + if (!Ty->isStructTy()) { + Type *NexTy = cast<SequentialType>(Ty)->getElementType(); + return NexTy; + } + // Otherwise it is a struct type. 
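+    // For example (illustrative types): next_type({i32, i8*}, 1) is i8*,
+    // selected by the constant index below.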
+ ConstantInt *CI = dyn_cast<ConstantInt>(Idx); + assert(CI && "Struct type with non-constant index"); + int64_t i = CI->getValue().getSExtValue(); + Type *NextTy = cast<StructType>(Ty)->getElementType(i); + return NextTy; + } + + + raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) { + OS << "{ {"; + bool Comma = false; + if (GN.Flags & GepNode::Root) { + OS << "root"; + Comma = true; + } + if (GN.Flags & GepNode::Internal) { + if (Comma) + OS << ','; + OS << "internal"; + Comma = true; + } + if (GN.Flags & GepNode::Used) { + if (Comma) + OS << ','; + OS << "used"; + Comma = true; + } + OS << "} "; + if (GN.Flags & GepNode::Root) + OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')'; + else + OS << "Parent:" << GN.Parent; + + OS << " Idx:"; + if (ConstantInt *CI = dyn_cast<ConstantInt>(GN.Idx)) + OS << CI->getValue().getSExtValue(); + else if (GN.Idx->hasName()) + OS << GN.Idx->getName(); + else + OS << "<anon> =" << *GN.Idx; + + OS << " PTy:"; + if (GN.PTy->isStructTy()) { + StructType *STy = cast<StructType>(GN.PTy); + if (!STy->isLiteral()) + OS << GN.PTy->getStructName(); + else + OS << "<anon-struct>:" << *STy; + } + else + OS << *GN.PTy; + OS << " }"; + return OS; + } + + + template <typename NodeContainer> + void dump_node_container(raw_ostream &OS, const NodeContainer &S) { + typedef typename NodeContainer::const_iterator const_iterator; + for (const_iterator I = S.begin(), E = S.end(); I != E; ++I) + OS << *I << ' ' << **I << '\n'; + } + + raw_ostream &operator<< (raw_ostream &OS, + const NodeVect &S) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) { + dump_node_container(OS, S); + return OS; + } + + + raw_ostream &operator<< (raw_ostream &OS, + const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){ + typedef NodeToUsesMap::const_iterator const_iterator; + for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) { + const UseSet &Us = I->second; + OS << I->first << " -> #" << Us.size() << '{'; + for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) { + User *R = (*J)->getUser(); + if (R->hasName()) + OS << ' ' << R->getName(); + else + OS << " <?>(" << *R << ')'; + } + OS << " }\n"; + } + return OS; + } + + + struct in_set { + in_set(const NodeSet &S) : NS(S) {} + bool operator() (GepNode *N) const { + return NS.find(N) != NS.end(); + } + private: + const NodeSet &NS; + }; +} + + +inline void *operator new(size_t, SpecificBumpPtrAllocator<GepNode> &A) { + return A.Allocate(); +} + + +void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root, + ValueVect &Order) { + // Compute block ordering for a typical DT-based traversal of the flow + // graph: "before visiting a block, all of its dominators must have been + // visited". + + Order.push_back(Root); + DomTreeNode *DTN = DT->getNode(Root); + typedef GraphTraits<DomTreeNode*> GTN; + typedef GTN::ChildIteratorType Iter; + for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I) + getBlockTraversalOrder((*I)->getBlock(), Order); +} + + +bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) { + // No vector GEPs. + if (!GepI->getType()->isPointerTy()) + return false; + // No GEPs without any indices. (Is this possible?) 
+ if (GepI->idx_begin() == GepI->idx_end()) + return false; + return true; +} + + +void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, + ValueToNodeMap &NM) { + DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n'); + GepNode *N = new (*Mem) GepNode; + Value *PtrOp = GepI->getPointerOperand(); + ValueToNodeMap::iterator F = NM.find(PtrOp); + if (F == NM.end()) { + N->BaseVal = PtrOp; + N->Flags |= GepNode::Root; + } else { + // If PtrOp was a GEP instruction, it must have already been processed. + // The ValueToNodeMap entry for it is the last gep node in the generated + // chain. Link to it here. + N->Parent = F->second; + } + N->PTy = PtrOp->getType(); + N->Idx = *GepI->idx_begin(); + + // Collect the list of users of this GEP instruction. Will add it to the + // last node created for it. + UseSet Us; + for (Value::user_iterator UI = GepI->user_begin(), UE = GepI->user_end(); + UI != UE; ++UI) { + // Check if this gep is used by anything other than other geps that + // we will process. + if (isa<GetElementPtrInst>(*UI)) { + GetElementPtrInst *UserG = cast<GetElementPtrInst>(*UI); + if (isHandledGepForm(UserG)) + continue; + } + Us.insert(&UI.getUse()); + } + Nodes.push_back(N); +#ifdef _MSC_VER + NodeOrder.special_insert_for_special_msvc(N); +#else + NodeOrder.insert(N); +#endif + + // Skip the first index operand, since we only handle 0. This dereferences + // the pointer operand. + GepNode *PN = N; + Type *PtrTy = cast<PointerType>(PtrOp->getType())->getElementType(); + for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end(); + OI != OE; ++OI) { + Value *Op = *OI; + GepNode *Nx = new (*Mem) GepNode; + Nx->Parent = PN; // Link Nx to the previous node. + Nx->Flags |= GepNode::Internal; + Nx->PTy = PtrTy; + Nx->Idx = Op; + Nodes.push_back(Nx); +#ifdef _MSC_VER + NodeOrder.special_insert_for_special_msvc(Nx); +#else + NodeOrder.insert(Nx); +#endif + PN = Nx; + + PtrTy = next_type(PtrTy, Op); + } + + // After last node has been created, update the use information. + if (!Us.empty()) { + PN->Flags |= GepNode::Used; + Uses[PN].insert(Us.begin(), Us.end()); + } + + // Link the last node with the originating GEP instruction. This is to + // help with linking chained GEP instructions. + NM.insert(std::make_pair(GepI, PN)); +} + + +void HexagonCommonGEP::collect() { + // Establish depth-first traversal order of the dominator tree. + ValueVect BO; + getBlockTraversalOrder(Fn->begin(), BO); + + // The creation of gep nodes requires DT-traversal. When processing a GEP + // instruction that uses another GEP instruction as the base pointer, the + // gep node for the base pointer should already exist. 
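+  // A sketch with illustrative IR: in
+  //   %a = getelementptr %struct.S, %struct.S* %p, i32 0, i32 1
+  //   %b = getelementptr i32, i32* %a, i32 2
+  // the definition of %a dominates %b, so %a is visited first and NM
+  // already maps it to the tail of its node chain when %b is processed.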
+ ValueToNodeMap NM; + for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) { + BasicBlock *B = cast<BasicBlock>(*I); + for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) { + if (!isa<GetElementPtrInst>(J)) + continue; + GetElementPtrInst *GepI = cast<GetElementPtrInst>(J); + if (isHandledGepForm(GepI)) + processGepInst(GepI, NM); + } + } + + DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes); +} + + +namespace { + void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM, + NodeVect &Roots) { + typedef NodeVect::const_iterator const_iterator; + for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + if (N->Flags & GepNode::Root) { + Roots.push_back(N); + continue; + } + GepNode *PN = N->Parent; + NCM[PN].push_back(N); + } + } + + void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM, NodeSet &Nodes) { + NodeVect Work; + Work.push_back(Root); + Nodes.insert(Root); + + while (!Work.empty()) { + NodeVect::iterator First = Work.begin(); + GepNode *N = *First; + Work.erase(First); + NodeChildrenMap::iterator CF = NCM.find(N); + if (CF != NCM.end()) { + Work.insert(Work.end(), CF->second.begin(), CF->second.end()); + Nodes.insert(CF->second.begin(), CF->second.end()); + } + } + } +} + + +namespace { + typedef std::set<NodeSet> NodeSymRel; + typedef std::pair<GepNode*,GepNode*> NodePair; + typedef std::set<NodePair> NodePairSet; + + const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) { + for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I) + if (I->count(N)) + return &*I; + return 0; + } + + // Create an ordered pair of GepNode pointers. The pair will be used in + // determining equality. The only purpose of the ordering is to eliminate + // duplication due to the commutativity of equality/non-equality. + NodePair node_pair(GepNode *N1, GepNode *N2) { + uintptr_t P1 = uintptr_t(N1), P2 = uintptr_t(N2); + if (P1 <= P2) + return std::make_pair(N1, N2); + return std::make_pair(N2, N1); + } + + unsigned node_hash(GepNode *N) { + // Include everything except flags and parent. + FoldingSetNodeID ID; + ID.AddPointer(N->Idx); + ID.AddPointer(N->PTy); + return ID.ComputeHash(); + } + + bool node_eq(GepNode *N1, GepNode *N2, NodePairSet &Eq, NodePairSet &Ne) { + // Don't cache the result for nodes with different hashes. The hash + // comparison is fast enough. + if (node_hash(N1) != node_hash(N2)) + return false; + + NodePair NP = node_pair(N1, N2); + NodePairSet::iterator FEq = Eq.find(NP); + if (FEq != Eq.end()) + return true; + NodePairSet::iterator FNe = Ne.find(NP); + if (FNe != Ne.end()) + return false; + // Not previously compared. + bool Root1 = N1->Flags & GepNode::Root; + bool Root2 = N2->Flags & GepNode::Root; + NodePair P = node_pair(N1, N2); + // If the Root flag has different values, the nodes are different. + // If both nodes are root nodes, but their base pointers differ, + // they are different. + if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) { + Ne.insert(P); + return false; + } + // Here the root flags are identical, and for root nodes the + // base pointers are equal, so the root nodes are equal. + // For non-root nodes, compare their parent nodes. + if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) { + Eq.insert(P); + return true; + } + return false; + } +} + + +void HexagonCommonGEP::common() { + // The essence of this commoning is finding gep nodes that are equal. + // To do this we need to compare all pairs of nodes. 
To save time,
+ // first, partition the set of all nodes into sets of potentially equal
+ // nodes, and then compare pairs from within each partition.
+ typedef std::map<unsigned,NodeSet> NodeSetMap;
+ NodeSetMap MaybeEq;
+
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ unsigned H = node_hash(N);
+ MaybeEq[H].insert(N);
+ }
+
+ // Compute the equivalence relation for the gep nodes. Use two caches,
+ // one for equality and the other for non-equality.
+ NodeSymRel EqRel; // Equality relation (as set of equivalence classes).
+ NodePairSet Eq, Ne; // Caches.
+ for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
+ I != E; ++I) {
+ NodeSet &S = I->second;
+ for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
+ GepNode *N = *NI;
+ // If node already has a class, then the class must have been created
+ // in a prior iteration of this loop. Since equality is transitive,
+ // nothing more will be added to that class, so skip it.
+ if (node_class(N, EqRel))
+ continue;
+
+ // Create a new class candidate now.
+ NodeSet C;
+ for (NodeSet::iterator NJ = std::next(NI); NJ != NE; ++NJ)
+ if (node_eq(N, *NJ, Eq, Ne))
+ C.insert(*NJ);
+ // If C is empty, N would be the only element in it. Don't bother
+ // creating a class for it then.
+ if (!C.empty()) {
+ C.insert(N); // Finalize the set before adding it to the relation.
+ std::pair<NodeSymRel::iterator, bool> Ins = EqRel.insert(C);
+ (void)Ins;
+ assert(Ins.second && "Cannot add a class");
+ }
+ }
+ }
+
+ DEBUG({
+ dbgs() << "Gep node equality:\n";
+ for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
+ dbgs() << "{ " << I->first << ", " << I->second << " }\n";
+
+ dbgs() << "Gep equivalence classes:\n";
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ dbgs() << '{';
+ const NodeSet &S = *I;
+ for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ if (J != S.begin())
+ dbgs() << ',';
+ dbgs() << ' ' << *J;
+ }
+ dbgs() << " }\n";
+ }
+ });
+
+
+ // Create a projection from a NodeSet to the minimal element in it.
+ typedef std::map<const NodeSet*,GepNode*> ProjMap;
+ ProjMap PM;
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ const NodeSet &S = *I;
+ GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder);
+ std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min));
+ (void)Ins;
+ assert(Ins.second && "Cannot add minimal element");
+
+ // Update the min element's flags and user list.
+ uint32_t Flags = 0;
+ UseSet &MinUs = Uses[Min];
+ for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ GepNode *N = *J;
+ uint32_t NF = N->Flags;
+ // If N is used, append all original values of N to the list of
+ // original values of Min.
+ if (NF & GepNode::Used)
+ MinUs.insert(Uses[N].begin(), Uses[N].end());
+ Flags |= NF;
+ }
+ if (MinUs.empty())
+ Uses.erase(Min);
+
+ // The collected flags should include all the flags from the min element.
+ assert((Min->Flags & Flags) == Min->Flags);
+ Min->Flags = Flags;
+ }
+
+ // Commoning: for each non-root gep node, replace "Parent" with the
+ // selected (minimum) node from the corresponding equivalence class.
+ // If a given parent does not have an equivalence class, leave it
+ // unchanged (it means that it's the only element in its class).
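+ // For example, two nodes with the same constant Idx, the same PTy and
+ // equal parents hash identically and fall into one class; the member
+ // that comes first in NodeOrder becomes the class representative.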
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + if (N->Flags & GepNode::Root) + continue; + const NodeSet *PC = node_class(N->Parent, EqRel); + if (!PC) + continue; + ProjMap::iterator F = PM.find(PC); + if (F == PM.end()) + continue; + // Found a replacement, use it. + GepNode *Rep = F->second; + N->Parent = Rep; + } + + DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes); + + // Finally, erase the nodes that are no longer used. + NodeSet Erase; + for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + const NodeSet *PC = node_class(N, EqRel); + if (!PC) + continue; + ProjMap::iterator F = PM.find(PC); + if (F == PM.end()) + continue; + if (N == F->second) + continue; + // Node for removal. + Erase.insert(*I); + } + NodeVect::iterator NewE = std::remove_if(Nodes.begin(), Nodes.end(), + in_set(Erase)); + Nodes.resize(std::distance(Nodes.begin(), NewE)); + + DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes); +} + + +namespace { + template <typename T> + BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) { + DEBUG({ + dbgs() << "NCD of {"; + for (typename T::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) { + if (!*I) + continue; + BasicBlock *B = cast<BasicBlock>(*I); + dbgs() << ' ' << B->getName(); + } + dbgs() << " }\n"; + }); + + // Allow null basic blocks in Blocks. In such cases, return 0. + typename T::iterator I = Blocks.begin(), E = Blocks.end(); + if (I == E || !*I) + return 0; + BasicBlock *Dom = cast<BasicBlock>(*I); + while (++I != E) { + BasicBlock *B = cast_or_null<BasicBlock>(*I); + Dom = B ? DT->findNearestCommonDominator(Dom, B) : 0; + if (!Dom) + return 0; + } + DEBUG(dbgs() << "computed:" << Dom->getName() << '\n'); + return Dom; + } + + template <typename T> + BasicBlock *nearest_common_dominatee(DominatorTree *DT, T &Blocks) { + // If two blocks, A and B, dominate a block C, then A dominates B, + // or B dominates A. + typename T::iterator I = Blocks.begin(), E = Blocks.end(); + // Find the first non-null block. + while (I != E && !*I) + ++I; + if (I == E) + return DT->getRoot(); + BasicBlock *DomB = cast<BasicBlock>(*I); + while (++I != E) { + if (!*I) + continue; + BasicBlock *B = cast<BasicBlock>(*I); + if (DT->dominates(B, DomB)) + continue; + if (!DT->dominates(DomB, B)) + return 0; + DomB = B; + } + return DomB; + } + + // Find the first use in B of any value from Values. If no such use, + // return B->end(). + template <typename T> + BasicBlock::iterator first_use_of_in_block(T &Values, BasicBlock *B) { + BasicBlock::iterator FirstUse = B->end(), BEnd = B->end(); + typedef typename T::iterator iterator; + for (iterator I = Values.begin(), E = Values.end(); I != E; ++I) { + Value *V = *I; + // If V is used in a PHI node, the use belongs to the incoming block, + // not the block with the PHI node. In the incoming block, the use + // would be considered as being at the end of it, so it cannot + // influence the position of the first use (which is assumed to be + // at the end to start with). 
+ if (isa<PHINode>(V)) + continue; + if (!isa<Instruction>(V)) + continue; + Instruction *In = cast<Instruction>(V); + if (In->getParent() != B) + continue; + BasicBlock::iterator It = In; + if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd)) + FirstUse = It; + } + return FirstUse; + } + + bool is_empty(const BasicBlock *B) { + return B->empty() || (&*B->begin() == B->getTerminator()); + } +} + + +BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + DEBUG(dbgs() << "Loc for node:" << Node << '\n'); + // Recalculate the placement for Node, assuming that the locations of + // its children in Loc are valid. + // Return 0 if there is no valid placement for Node (for example, it + // uses an index value that is not available at the location required + // to dominate all children, etc.). + + // Find the nearest common dominator for: + // - all users, if the node is used, and + // - all children. + ValueVect Bs; + if (Node->Flags & GepNode::Used) { + // Append all blocks with uses of the original values to the + // block vector Bs. + NodeToUsesMap::iterator UF = Uses.find(Node); + assert(UF != Uses.end() && "Used node with no use information"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { + Use *U = *I; + User *R = U->getUser(); + if (!isa<Instruction>(R)) + continue; + BasicBlock *PB = isa<PHINode>(R) + ? cast<PHINode>(R)->getIncomingBlock(*U) + : cast<Instruction>(R)->getParent(); + Bs.push_back(PB); + } + } + // Append the location of each child. + NodeChildrenMap::iterator CF = NCM.find(Node); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { + GepNode *CN = *I; + NodeToValueMap::iterator LF = Loc.find(CN); + // If the child is only used in GEP instructions (i.e. is not used in + // non-GEP instructions), the nearest dominator computed for it may + // have been null. In such case it won't have a location available. + if (LF == Loc.end()) + continue; + Bs.push_back(LF->second); + } + } + + BasicBlock *DomB = nearest_common_dominator(DT, Bs); + if (!DomB) + return 0; + // Check if the index used by Node dominates the computed dominator. + Instruction *IdxI = dyn_cast<Instruction>(Node->Idx); + if (IdxI && !DT->dominates(IdxI->getParent(), DomB)) + return 0; + + // Avoid putting nodes into empty blocks. + while (is_empty(DomB)) { + DomTreeNode *N = (*DT)[DomB]->getIDom(); + if (!N) + break; + DomB = N->getBlock(); + } + + // Otherwise, DomB is fine. Update the location map. + Loc[Node] = DomB; + return DomB; +} + + +BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n'); + // Recalculate the placement of Node, after recursively recalculating the + // placements of all its children. 
+ NodeChildrenMap::iterator CF = NCM.find(Node); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) + recalculatePlacementRec(*I, NCM, Loc); + } + BasicBlock *LB = recalculatePlacement(Node, NCM, Loc); + DEBUG(dbgs() << "LocRec end for node:" << Node << '\n'); + return LB; +} + + +bool HexagonCommonGEP::isInvariantIn(Value *Val, Loop *L) { + if (isa<Constant>(Val) || isa<Argument>(Val)) + return true; + Instruction *In = dyn_cast<Instruction>(Val); + if (!In) + return false; + BasicBlock *HdrB = L->getHeader(), *DefB = In->getParent(); + return DT->properlyDominates(DefB, HdrB); +} + + +bool HexagonCommonGEP::isInvariantIn(GepNode *Node, Loop *L) { + if (Node->Flags & GepNode::Root) + if (!isInvariantIn(Node->BaseVal, L)) + return false; + return isInvariantIn(Node->Idx, L); +} + + +bool HexagonCommonGEP::isInMainPath(BasicBlock *B, Loop *L) { + BasicBlock *HB = L->getHeader(); + BasicBlock *LB = L->getLoopLatch(); + // B must post-dominate the loop header or dominate the loop latch. + if (PDT->dominates(B, HB)) + return true; + if (LB && DT->dominates(B, LB)) + return true; + return false; +} + + +namespace { + BasicBlock *preheader(DominatorTree *DT, Loop *L) { + if (BasicBlock *PH = L->getLoopPreheader()) + return PH; + if (!OptSpeculate) + return 0; + DomTreeNode *DN = DT->getNode(L->getHeader()); + if (!DN) + return 0; + return DN->getIDom()->getBlock(); + } +} + + +BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + // Find the "topmost" location for Node: it must be dominated by both, + // its parent (or the BaseVal, if it's a root node), and by the index + // value. + ValueVect Bs; + if (Node->Flags & GepNode::Root) { + if (Instruction *PIn = dyn_cast<Instruction>(Node->BaseVal)) + Bs.push_back(PIn->getParent()); + } else { + Bs.push_back(Loc[Node->Parent]); + } + if (Instruction *IIn = dyn_cast<Instruction>(Node->Idx)) + Bs.push_back(IIn->getParent()); + BasicBlock *TopB = nearest_common_dominatee(DT, Bs); + + // Traverse the loop nest upwards until we find a loop in which Node + // is no longer invariant, or until we get to the upper limit of Node's + // placement. The traversal will also stop when a suitable "preheader" + // cannot be found for a given loop. The "preheader" may actually be + // a regular block outside of the loop (i.e. not guarded), in which case + // the Node will be speculated. + // For nodes that are not in the main path of the containing loop (i.e. + // are not executed in each iteration), do not move them out of the loop. + BasicBlock *LocB = cast_or_null<BasicBlock>(Loc[Node]); + if (LocB) { + Loop *Lp = LI->getLoopFor(LocB); + while (Lp) { + if (!isInvariantIn(Node, Lp) || !isInMainPath(LocB, Lp)) + break; + BasicBlock *NewLoc = preheader(DT, Lp); + if (!NewLoc || !DT->dominates(TopB, NewLoc)) + break; + Lp = Lp->getParentLoop(); + LocB = NewLoc; + } + } + Loc[Node] = LocB; + + // Recursively compute the locations of all children nodes. 
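+  // (Note on the loop above: with OptSpeculate enabled, the "preheader"
+  // may be the header's immediate dominator rather than a true guarded
+  // preheader, in which case the node is placed speculatively.)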
+ NodeChildrenMap::iterator CF = NCM.find(Node);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+ adjustForInvariance(*I, NCM, Loc);
+ }
+ return LocB;
+}
+
+
+namespace {
+ struct LocationAsBlock {
+ LocationAsBlock(const NodeToValueMap &L) : Map(L) {}
+ const NodeToValueMap &Map;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+ for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
+ I != E; ++I) {
+ OS << I->first << " -> ";
+ BasicBlock *B = cast<BasicBlock>(I->second);
+ OS << B->getName() << '(' << B << ')';
+ OS << '\n';
+ }
+ return OS;
+ }
+
+ inline bool is_constant(GepNode *N) {
+ return isa<ConstantInt>(N->Idx);
+ }
+}
+
+
+void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
+ NodeToValueMap &Loc) {
+ User *R = U->getUser();
+ DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
+ << *R << '\n');
+ BasicBlock *PB = cast<Instruction>(R)->getParent();
+
+ GepNode *N = Node;
+ GepNode *C = 0, *NewNode = 0;
+ while (is_constant(N) && !(N->Flags & GepNode::Root)) {
+ // XXX if (single-use) dont-replicate;
+ GepNode *NewN = new (*Mem) GepNode(N);
+ Nodes.push_back(NewN);
+ Loc[NewN] = PB;
+
+ if (N == Node)
+ NewNode = NewN;
+ NewN->Flags &= ~GepNode::Used;
+ if (C)
+ C->Parent = NewN;
+ C = NewN;
+ N = N->Parent;
+ }
+ if (!NewNode)
+ return;
+
+ // Move over all uses that share the same user as U from Node to NewNode.
+ NodeToUsesMap::iterator UF = Uses.find(Node);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ UseSet NewUs;
+ for (UseSet::iterator I = Us.begin(); I != Us.end(); ) {
+ User *S = (*I)->getUser();
+ UseSet::iterator Nx = std::next(I);
+ if (S == R) {
+ NewUs.insert(*I);
+ Us.erase(I);
+ }
+ I = Nx;
+ }
+ if (Us.empty()) {
+ Node->Flags &= ~GepNode::Used;
+ Uses.erase(UF);
+ }
+
+ // Should at least have U in NewUs.
+ NewNode->Flags |= GepNode::Used;
+ DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
+ assert(!NewUs.empty());
+ Uses[NewNode] = NewUs;
+}
+
+
+void HexagonCommonGEP::separateConstantChains(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ // First approximation: extract all chains.
+ NodeSet Ns;
+ nodes_for_root(Node, NCM, Ns);
+
+ DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+ // Collect all used nodes together with the uses from loads and stores,
+ // where the GEP node could be folded into the load/store instruction.
+ NodeToUsesMap FNs; // Foldable nodes.
+ for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (!(N->Flags & GepNode::Used))
+ continue;
+ NodeToUsesMap::iterator UF = Uses.find(N);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ // Loads/stores that use the node N.
+ UseSet LSs;
+ for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+ Use *U = *J;
+ User *R = U->getUser();
+ // We're interested in uses that provide the address. The stored value
+ // may itself come from a GEP, but we won't handle those cases
+ // here for now.
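+ // For example, in "store i32 %v, i32* %a" only the pointer-operand use
+ // of %a is collected below; a use of %a as the stored value would not
+ // be foldable into the addressing mode.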
+ if (LoadInst *Ld = dyn_cast<LoadInst>(R)) { + unsigned PtrX = LoadInst::getPointerOperandIndex(); + if (&Ld->getOperandUse(PtrX) == U) + LSs.insert(U); + } else if (StoreInst *St = dyn_cast<StoreInst>(R)) { + unsigned PtrX = StoreInst::getPointerOperandIndex(); + if (&St->getOperandUse(PtrX) == U) + LSs.insert(U); + } + } + // Even if the total use count is 1, separating the chain may still be + // beneficial, since the constant chain may be longer than the GEP alone + // would be (e.g. if the parent node has a constant index and also has + // other children). + if (!LSs.empty()) + FNs.insert(std::make_pair(N, LSs)); + } + + DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs); + + for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) { + GepNode *N = I->first; + UseSet &Us = I->second; + for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) + separateChainForNode(N, *J, Loc); + } +} + + +void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) { + // Compute the inverse of the Node.Parent links. Also, collect the set + // of root nodes. + NodeChildrenMap NCM; + NodeVect Roots; + invert_find_roots(Nodes, NCM, Roots); + + // Compute the initial placement determined by the users' locations, and + // the locations of the child nodes. + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + recalculatePlacementRec(*I, NCM, Loc); + + DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc)); + + if (OptEnableInv) { + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + adjustForInvariance(*I, NCM, Loc); + + DEBUG(dbgs() << "Node placement after adjustment for invariance:\n" + << LocationAsBlock(Loc)); + } + if (OptEnableConst) { + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + separateConstantChains(*I, NCM, Loc); + } + DEBUG(dbgs() << "Node use information:\n" << Uses); + + // At the moment, there is no further refinement of the initial placement. + // Such a refinement could include splitting the nodes if they are placed + // too far from some of its users. + + DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc)); +} + + +Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At, + BasicBlock *LocB) { + DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName() + << " for nodes:\n" << NA); + unsigned Num = NA.size(); + GepNode *RN = NA[0]; + assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root"); + + Value *NewInst = 0; + Value *Input = RN->BaseVal; + Value **IdxList = new Value*[Num+1]; + unsigned nax = 0; + do { + unsigned IdxC = 0; + // If the type of the input of the first node is not a pointer, + // we need to add an artificial i32 0 to the indices (because the + // actual input in the IR will be a pointer). + if (!NA[nax]->PTy->isPointerTy()) { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0); + } + + // Keep adding indices from NA until we have to stop and generate + // an "intermediate" GEP. + while (++nax <= Num) { + GepNode *N = NA[nax-1]; + IdxList[IdxC++] = N->Idx; + if (nax < Num) { + // We have to stop, if the expected type of the output of this node + // is not the same as the input type of the next node. 
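+        // When the types no longer chain, the indices gathered so far are
+        // emitted as one intermediate "cgep" below, and the next iteration
+        // of the enclosing do-loop starts a new GEP that consumes it as
+        // Input.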
+ Type *NextTy = next_type(N->PTy, N->Idx); + if (NextTy != NA[nax]->PTy) + break; + } + } + ArrayRef<Value*> A(IdxList, IdxC); + Type *InpTy = Input->getType(); + Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType(); + NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At); + DEBUG(dbgs() << "new GEP: " << *NewInst << '\n'); + Input = NewInst; + } while (nax <= Num); + + delete[] IdxList; + return NewInst; +} + + +void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values, + NodeChildrenMap &NCM) { + NodeVect Work; + Work.push_back(Node); + + while (!Work.empty()) { + NodeVect::iterator First = Work.begin(); + GepNode *N = *First; + Work.erase(First); + if (N->Flags & GepNode::Used) { + NodeToUsesMap::iterator UF = Uses.find(N); + assert(UF != Uses.end() && "No use information for used node"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) + Values.push_back((*I)->getUser()); + } + NodeChildrenMap::iterator CF = NCM.find(N); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + Work.insert(Work.end(), Cs.begin(), Cs.end()); + } + } +} + + +void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { + DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n'); + NodeChildrenMap NCM; + NodeVect Roots; + // Compute the inversion again, since computing placement could alter + // "parent" relation between nodes. + invert_find_roots(Nodes, NCM, Roots); + + while (!Roots.empty()) { + NodeVect::iterator First = Roots.begin(); + GepNode *Root = *First, *Last = *First; + Roots.erase(First); + + NodeVect NA; // Nodes to assemble. + // Append to NA all child nodes up to (and including) the first child + // that: + // (1) has more than 1 child, or + // (2) is used, or + // (3) has a child located in a different block. + bool LastUsed = false; + unsigned LastCN = 0; + // The location may be null if the computation failed (it can legitimately + // happen for nodes created from dead GEPs). + Value *LocV = Loc[Last]; + if (!LocV) + continue; + BasicBlock *LastB = cast<BasicBlock>(LocV); + do { + NA.push_back(Last); + LastUsed = (Last->Flags & GepNode::Used); + if (LastUsed) + break; + NodeChildrenMap::iterator CF = NCM.find(Last); + LastCN = (CF != NCM.end()) ? CF->second.size() : 0; + if (LastCN != 1) + break; + GepNode *Child = CF->second.front(); + BasicBlock *ChildB = cast_or_null<BasicBlock>(Loc[Child]); + if (ChildB != 0 && LastB != ChildB) + break; + Last = Child; + } while (true); + + BasicBlock::iterator InsertAt = LastB->getTerminator(); + if (LastUsed || LastCN > 0) { + ValueVect Urs; + getAllUsersForNode(Root, Urs, NCM); + BasicBlock::iterator FirstUse = first_use_of_in_block(Urs, LastB); + if (FirstUse != LastB->end()) + InsertAt = FirstUse; + } + + // Generate a new instruction for NA. + Value *NewInst = fabricateGEP(NA, InsertAt, LastB); + + // Convert all the children of Last node into roots, and append them + // to the Roots list. + if (LastCN > 0) { + NodeVect &Cs = NCM[Last]; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { + GepNode *CN = *I; + CN->Flags &= ~GepNode::Internal; + CN->Flags |= GepNode::Root; + CN->BaseVal = NewInst; + Roots.push_back(CN); + } + } + + // Lastly, if the Last node was used, replace all uses with the new GEP. + // The uses reference the original GEP values. 
+ if (LastUsed) { + NodeToUsesMap::iterator UF = Uses.find(Last); + assert(UF != Uses.end() && "No use information found"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { + Use *U = *I; + U->set(NewInst); + } + } + } +} + + +void HexagonCommonGEP::removeDeadCode() { + ValueVect BO; + BO.push_back(&Fn->front()); + + for (unsigned i = 0; i < BO.size(); ++i) { + BasicBlock *B = cast<BasicBlock>(BO[i]); + DomTreeNode *N = DT->getNode(B); + typedef GraphTraits<DomTreeNode*> GTN; + typedef GTN::ChildIteratorType Iter; + for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + BO.push_back((*I)->getBlock()); + } + + for (unsigned i = BO.size(); i > 0; --i) { + BasicBlock *B = cast<BasicBlock>(BO[i-1]); + BasicBlock::InstListType &IL = B->getInstList(); + typedef BasicBlock::InstListType::reverse_iterator reverse_iterator; + ValueVect Ins; + for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I) + Ins.push_back(&*I); + for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { + Instruction *In = cast<Instruction>(*I); + if (isInstructionTriviallyDead(In)) + In->eraseFromParent(); + } + } +} + + +bool HexagonCommonGEP::runOnFunction(Function &F) { + // For now bail out on C++ exception handling. + for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A) + for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I) + if (isa<InvokeInst>(I) || isa<LandingPadInst>(I)) + return false; + + Fn = &F; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PDT = &getAnalysis<PostDominatorTree>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + Ctx = &F.getContext(); + + Nodes.clear(); + Uses.clear(); + NodeOrder.clear(); + + SpecificBumpPtrAllocator<GepNode> Allocator; + Mem = &Allocator; + + collect(); + common(); + + NodeToValueMap Loc; + computeNodePlacement(Loc); + materialize(Loc); + removeDeadCode(); + +#ifdef XDEBUG + // Run this only when expensive checks are enabled. + verifyFunction(F); +#endif + return true; +} + + +namespace llvm { + FunctionPass *createHexagonCommonGEP() { + return new HexagonCommonGEP(); + } +} diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 37ed173a79cd..ce10aeadef94 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -1,3 +1,12 @@ +//===--- HexagonExpandCondsets.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + // Replace mux instructions with the corresponding legal instructions. // It is meant to work post-SSA, but still on virtual registers. It was // originally placed between register coalescing and machine instruction diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 868f87e18413..29283c81877e 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. 
for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (MRI.isPhysRegUsed(FreeReg)) + if (!MRI.reg_nodbg_empty(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (MRI.isPhysRegUsed(*AI)) { + if (!MRI.reg_nodbg_empty(*AI)) { IsCurrentRegUsed = true; break; } @@ -959,8 +959,11 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF) } -void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger* RS) const { +void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); auto &HRI = *HST.getRegisterInfo(); @@ -969,11 +972,9 @@ void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan( // If we have a function containing __builtin_eh_return we want to spill and // restore all callee saved registers. Pretend that they are used. if (HasEHReturn) { - MachineRegisterInfo &MRI = MF.getRegInfo(); for (const MCPhysReg *CSRegs = HRI.getCalleeSavedRegs(&MF); *CSRegs; ++CSRegs) - if (!MRI.isPhysRegUsed(*CSRegs)) - MRI.setPhysRegUsed(*CSRegs); + SavedRegs.set(*CSRegs); } const TargetRegisterClass &RC = Hexagon::IntRegsRegClass; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 89500cb85724..d39ee2c77195 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -45,7 +45,7 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; bool targetHandlesStackFrameRounding() const override { diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp new file mode 100644 index 000000000000..4d32208bd5aa --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -0,0 +1,259 @@ +//===--- HexagonGenExtract.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<unsigned> ExtractCutoff("extract-cutoff", cl::init(~0U), + cl::Hidden, cl::desc("Cutoff for generating \"extract\"" + " instructions")); + +// This prevents generating extract instructions that have the offset of 0. 
+// One of the reasons for "extract" is to put a sequence of bits in a regis- +// ter, starting at offset 0 (so that these bits can then be used by an +// "insert"). If the bits are already at offset 0, it is better not to gene- +// rate "extract", since logical bit operations can be merged into compound +// instructions (as opposed to "extract"). +static cl::opt<bool> NoSR0("extract-nosr0", cl::init(true), cl::Hidden, + cl::desc("No extract instruction with offset 0")); + +static cl::opt<bool> NeedAnd("extract-needand", cl::init(true), cl::Hidden, + cl::desc("Require & in extract patterns")); + +namespace llvm { + void initializeHexagonGenExtractPass(PassRegistry&); + FunctionPass *createHexagonGenExtract(); +} + + +namespace { + class HexagonGenExtract : public FunctionPass { + public: + static char ID; + HexagonGenExtract() : FunctionPass(ID), ExtractCount(0) { + initializeHexagonGenExtractPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const override { + return "Hexagon generate \"extract\" instructions"; + } + virtual bool runOnFunction(Function &F) override; + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + } + private: + bool visitBlock(BasicBlock *B); + bool convert(Instruction *In); + + unsigned ExtractCount; + DominatorTree *DT; + }; + + char HexagonGenExtract::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonGenExtract, "hextract", "Hexagon generate " + "\"extract\" instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonGenExtract, "hextract", "Hexagon generate " + "\"extract\" instructions", false, false) + + +bool HexagonGenExtract::convert(Instruction *In) { + using namespace PatternMatch; + Value *BF = 0; + ConstantInt *CSL = 0, *CSR = 0, *CM = 0; + BasicBlock *BB = In->getParent(); + LLVMContext &Ctx = BB->getContext(); + bool LogicalSR; + + // (and (shl (lshr x, #sr), #sl), #m) + LogicalSR = true; + bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL)), + m_ConstantInt(CM))); + + if (!Match) { + // (and (shl (ashr x, #sr), #sl), #m) + LogicalSR = false; + Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL)), + m_ConstantInt(CM))); + } + if (!Match) { + // (and (shl x, #sl), #m) + LogicalSR = true; + CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)), + m_ConstantInt(CM))); + if (Match && NoSR0) + return false; + } + if (!Match) { + // (and (lshr x, #sr), #m) + LogicalSR = true; + CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CM))); + } + if (!Match) { + // (and (ashr x, #sr), #m) + LogicalSR = false; + CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CM))); + } + if (!Match) { + CM = 0; + // (shl (lshr x, #sr), #sl) + LogicalSR = true; + Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL))); + } + if (!Match) { + CM = 0; + // (shl (ashr x, #sr), #sl) + LogicalSR = false; + Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL))); + } + if (!Match) + return false; + + Type *Ty = BF->getType(); + if (!Ty->isIntegerTy()) + 
return false;
+  unsigned BW = Ty->getPrimitiveSizeInBits();
+  if (BW != 32 && BW != 64)
+    return false;
+
+  uint32_t SR = CSR->getZExtValue();
+  uint32_t SL = CSL->getZExtValue();
+
+  if (!CM) {
+    // If there was no and, and the shift left did not remove all potential
+    // sign bits created by the shift right, then extractu cannot reproduce
+    // this value.
+    if (!LogicalSR && (SR > SL))
+      return false;
+    APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
+    CM = ConstantInt::get(Ctx, A);
+  }
+
+  // CM is the shifted-left mask. Shift it back right to remove the zero
+  // bits on least-significant positions.
+  APInt M = CM->getValue().lshr(SL);
+  uint32_t T = M.countTrailingOnes();
+
+  // During the shifts some of the bits will be lost. Calculate how many
+  // bits of the original value will remain after shift right and then left.
+  uint32_t U = BW - std::max(SL, SR);
+  // The width of the extracted field is the minimum of the original bits
+  // that remain after the shifts and the number of contiguous 1s in the mask.
+  uint32_t W = std::min(U, T);
+  if (W == 0)
+    return false;
+
+  // Check if the extracted bits are contained within the mask that it is
+  // and-ed with. The extract operation will copy these bits, and so the
+  // mask cannot have any holes in it that would clear any of the bits of
+  // the extracted field.
+  if (!LogicalSR) {
+    // If the shift right was arithmetic, it could have included some 1 bits.
+    // It is still ok to generate extract, but only if the mask eliminates
+    // those bits (i.e. M does not have any bits set beyond U).
+    APInt C = APInt::getHighBitsSet(BW, BW-U);
+    if (M.intersects(C) || !APIntOps::isMask(W, M))
+      return false;
+  } else {
+    // Check if M starts with a contiguous sequence of W 1-bits. Get
+    // the low U bits of M (which eliminates the 0 bits shifted in on the
+    // left), and check if the result is APInt's "mask":
+    if (!APIntOps::isMask(W, M.getLoBits(U)))
+      return false;
+  }
+
+  IRBuilder<> IRB(BB, In);
+  Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
+                                   : Intrinsic::hexagon_S2_extractup;
+  Module *Mod = BB->getParent()->getParent();
+  Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+  Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+  if (SL != 0)
+    NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
+  In->replaceAllUsesWith(NewIn);
+  return true;
+}
+
+
+bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+  // Depth-first, bottom-up traversal.
+  DomTreeNode *DTN = DT->getNode(B);
+  typedef GraphTraits<DomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType Iter;
+  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+    visitBlock((*I)->getBlock());
+
+  // Allow limiting the number of generated extracts for debugging purposes.
+  bool HasCutoff = ExtractCutoff.getPosition();
+  unsigned Cutoff = ExtractCutoff;
+
+  bool Changed = false;
+  BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
+  while (true) {
+    if (HasCutoff && (ExtractCount >= Cutoff))
+      return Changed;
+    bool Last = (I == Begin);
+    if (!Last)
+      NextI = std::prev(I);
+    Instruction *In = &*I;
+    bool Done = convert(In);
+    if (HasCutoff && Done)
+      ExtractCount++;
+    Changed |= Done;
+    if (Last)
+      break;
+    I = NextI;
+  }
+  return Changed;
+}
+
+
+bool HexagonGenExtract::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  bool Changed;
+
+  // Traverse the function bottom-up, to see super-expressions before their
+  // sub-expressions.
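+  // Visiting a user before its operands keeps the largest matching pattern
+  // intact: once an expression is rewritten into an extract intrinsic, the
+  // shift/and patterns feeding it would no longer match as part of a bigger
+  // tree.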
+ BasicBlock *Entry = GraphTraits<Function*>::getEntryNode(&F); + Changed = visitBlock(Entry); + + return Changed; +} + + +FunctionPass *llvm::createHexagonGenExtract() { + return new HexagonGenExtract(); +} diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp new file mode 100644 index 000000000000..096da949e77b --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -0,0 +1,1598 @@ +//===--- HexagonGenInsert.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexinsert" + +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include "Hexagon.h" +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +#include <map> +#include <vector> + +using namespace llvm; + +static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), + cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation.")); +// The distance cutoff is selected based on the precheckin-perf results: +// cutoffs 20, 25, 35, and 40 are worse than 30. +static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U), + cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert " + "generation.")); + +static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable timing of insert generation")); +static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false), + cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert " + "generation")); + +static cl::opt<bool> OptSelectAll0("insert-all0", cl::init(false), cl::Hidden, + cl::ZeroOrMore); +static cl::opt<bool> OptSelectHas0("insert-has0", cl::init(false), cl::Hidden, + cl::ZeroOrMore); +// Whether to construct constant values via "insert". Could eliminate constant +// extenders, but often not practical. +static cl::opt<bool> OptConst("insert-const", cl::init(false), cl::Hidden, + cl::ZeroOrMore); + +namespace { + // The preprocessor gets confused when the DEBUG macro is passed larger + // chunks of code. Use this function to detect debugging. + inline bool isDebug() { +#ifndef NDEBUG + return ::llvm::DebugFlag && ::llvm::isCurrentDebugType(DEBUG_TYPE); +#else + return false; +#endif + } +} + + +namespace { + // Set of virtual registers, based on BitVector. 
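+  // The set is indexed by virtReg2Index(R), i.e. the register number with
+  // the high bit stripped, and converted back with index2VirtReg on the way
+  // out. Since every virtual register number has the high bit set, 0 can
+  // never name one, so find_first/find_next below return 0 to mean "no more
+  // registers" and callers iterate with "for (R = S.find_first(); R; ...)".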
+ struct RegisterSet : private BitVector { + RegisterSet() : BitVector() {} + explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + RegisterSet(const RegisterSet &RS) : BitVector(RS) {} + + using BitVector::clear; + + unsigned find_first() const { + int First = BitVector::find_first(); + if (First < 0) + return 0; + return x2v(First); + } + + unsigned find_next(unsigned Prev) const { + int Next = BitVector::find_next(v2x(Prev)); + if (Next < 0) + return 0; + return x2v(Next); + } + + RegisterSet &insert(unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return static_cast<RegisterSet&>(BitVector::set(Idx)); + } + RegisterSet &remove(unsigned R) { + unsigned Idx = v2x(R); + if (Idx >= size()) + return *this; + return static_cast<RegisterSet&>(BitVector::reset(Idx)); + } + + RegisterSet &insert(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::operator|=(Rs)); + } + RegisterSet &remove(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::reset(Rs)); + } + + reference operator[](unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return BitVector::operator[](Idx); + } + bool operator[](unsigned R) const { + unsigned Idx = v2x(R); + assert(Idx < size()); + return BitVector::operator[](Idx); + } + bool has(unsigned R) const { + unsigned Idx = v2x(R); + if (Idx >= size()) + return false; + return BitVector::test(Idx); + } + + bool empty() const { + return !BitVector::any(); + } + bool includes(const RegisterSet &Rs) const { + // A.BitVector::test(B) <=> A-B != {} + return !Rs.BitVector::test(*this); + } + bool intersects(const RegisterSet &Rs) const { + return BitVector::anyCommon(Rs); + } + + private: + void ensure(unsigned Idx) { + if (size() <= Idx) + resize(std::max(Idx+1, 32U)); + } + static inline unsigned v2x(unsigned v) { + return TargetRegisterInfo::virtReg2Index(v); + } + static inline unsigned x2v(unsigned x) { + return TargetRegisterInfo::index2VirtReg(x); + } + }; + + + struct PrintRegSet { + PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI) + : RS(S), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, + const PrintRegSet &P); + private: + const RegisterSet &RS; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { + OS << '{'; + for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) + OS << ' ' << PrintReg(R, P.TRI); + OS << " }"; + return OS; + } +} + + +namespace { + // A convenience class to associate unsigned numbers (such as virtual + // registers) with unsigned numbers. + struct UnsignedMap : public DenseMap<unsigned,unsigned> { + UnsignedMap() : BaseType() {} + private: + typedef DenseMap<unsigned,unsigned> BaseType; + }; + + // A utility to establish an ordering between virtual registers: + // VRegA < VRegB <=> RegisterOrdering[VRegA] < RegisterOrdering[VRegB] + // This is meant as a cache for the ordering of virtual registers defined + // by a potentially expensive comparison function, or obtained by a proce- + // dure that should not be repeated each time two registers are compared. + struct RegisterOrdering : public UnsignedMap { + RegisterOrdering() : UnsignedMap() {} + unsigned operator[](unsigned VR) const { + const_iterator F = find(VR); + assert(F != end()); + return F->second; + } + // Add operator(), so that objects of this class can be used as + // comparators in std::sort et al. 
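+    // Usage sketch (illustrative, not from this commit):
+    //   std::lower_bound(Seq.begin(), Seq.end(), VR, Ord)
+    // as OrderedRegisterList::insert() does below, with Ord being a
+    // RegisterOrdering.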
+    bool operator() (unsigned VR1, unsigned VR2) const {
+      return operator[](VR1) < operator[](VR2);
+    }
+  };
+}
+
+
+namespace {
+  // Ordering of bit values. This class does not have operator[], but
+  // it supplies a comparison operator() for use in std:: algorithms.
+  // The order is as follows:
+  // - 0 < 1 < ref
+  // - ref1 < ref2, if ord(ref1.Reg) < ord(ref2.Reg),
+  //   or ord(ref1.Reg) == ord(ref2.Reg), and ref1.Pos < ref2.Pos.
+  struct BitValueOrdering {
+    BitValueOrdering(const RegisterOrdering &RB) : BaseOrd(RB) {}
+    bool operator() (const BitTracker::BitValue &V1,
+          const BitTracker::BitValue &V2) const;
+    const RegisterOrdering &BaseOrd;
+  };
+}
+
+
+bool BitValueOrdering::operator() (const BitTracker::BitValue &V1,
+      const BitTracker::BitValue &V2) const {
+  if (V1 == V2)
+    return false;
+  // V1==0 => true, V2==0 => false
+  if (V1.is(0) || V2.is(0))
+    return V1.is(0);
+  // Neither of V1,V2 is 0, and V1!=V2.
+  // V2==1 => false, V1==1 => true
+  if (V2.is(1) || V1.is(1))
+    return !V2.is(1);
+  // Both V1,V2 are refs.
+  unsigned Ind1 = BaseOrd[V1.RefI.Reg], Ind2 = BaseOrd[V2.RefI.Reg];
+  if (Ind1 != Ind2)
+    return Ind1 < Ind2;
+  // Both refer to the same register; since V1 != V2, the bit positions
+  // must differ.
+  assert(V1.RefI.Pos != V2.RefI.Pos && "Bit values should be different");
+  return V1.RefI.Pos < V2.RefI.Pos;
+}
+
+
+namespace {
+  // Cache for the BitTracker's cell map. Map lookup has logarithmic
+  // complexity, so this class memoizes the lookup results to reduce
+  // the access time for repeated lookups of the same cell.
+  struct CellMapShadow {
+    CellMapShadow(const BitTracker &T) : BT(T) {}
+    const BitTracker::RegisterCell &lookup(unsigned VR) {
+      unsigned RInd = TargetRegisterInfo::virtReg2Index(VR);
+      // Grow the vector to at least 32 elements.
+      if (RInd >= CVect.size())
+        CVect.resize(std::max(RInd+16, 32U), 0);
+      const BitTracker::RegisterCell *CP = CVect[RInd];
+      if (CP == 0)
+        CP = CVect[RInd] = &BT.lookup(VR);
+      return *CP;
+    }
+
+    const BitTracker &BT;
+
+  private:
+    typedef std::vector<const BitTracker::RegisterCell*> CellVectType;
+    CellVectType CVect;
+  };
+}
+
+
+namespace {
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the corresponding BitTracker::RegisterCell objects.
+  struct RegisterCellLexCompare {
+    RegisterCellLexCompare(const BitValueOrdering &BO, CellMapShadow &M)
+      : BitOrd(BO), CM(M) {}
+    bool operator() (unsigned VR1, unsigned VR2) const;
+  private:
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the specified bits of the corresponding BitTracker::
+  // RegisterCell objects.
+  // Specifically, this class will be used to compare bit B of a register
+  // cell for a selected virtual register R with bit N of any register
+  // other than R.
+  struct RegisterCellBitCompareSel {
+    RegisterCellBitCompareSel(unsigned R, unsigned B, unsigned N,
+          const BitValueOrdering &BO, CellMapShadow &M)
+      : SelR(R), SelB(B), BitN(N), BitOrd(BO), CM(M) {}
+    bool operator() (unsigned VR1, unsigned VR2) const;
+  private:
+    const unsigned SelR, SelB;
+    const unsigned BitN;
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+}
+
+
+bool RegisterCellLexCompare::operator() (unsigned VR1, unsigned VR2) const {
+  // Ordering of registers, made up from two given orderings:
+  // - the ordering of the register numbers, and
+  // - the ordering of register cells.
+  // Def. R1 < R2 if:
+  // - cell(R1) < cell(R2), or
+  // - cell(R1) == cell(R2), and index(R1) < index(R2).
+ // + // For register cells, the ordering is lexicographic, with index 0 being + // the most significant. + if (VR1 == VR2) + return false; + + const BitTracker::RegisterCell &RC1 = CM.lookup(VR1), &RC2 = CM.lookup(VR2); + uint16_t W1 = RC1.width(), W2 = RC2.width(); + for (uint16_t i = 0, w = std::min(W1, W2); i < w; ++i) { + const BitTracker::BitValue &V1 = RC1[i], &V2 = RC2[i]; + if (V1 != V2) + return BitOrd(V1, V2); + } + // Cells are equal up until the common length. + if (W1 != W2) + return W1 < W2; + + return BitOrd.BaseOrd[VR1] < BitOrd.BaseOrd[VR2]; +} + + +bool RegisterCellBitCompareSel::operator() (unsigned VR1, unsigned VR2) const { + if (VR1 == VR2) + return false; + const BitTracker::RegisterCell &RC1 = CM.lookup(VR1); + const BitTracker::RegisterCell &RC2 = CM.lookup(VR2); + uint16_t W1 = RC1.width(), W2 = RC2.width(); + uint16_t Bit1 = (VR1 == SelR) ? SelB : BitN; + uint16_t Bit2 = (VR2 == SelR) ? SelB : BitN; + // If Bit1 exceeds the width of VR1, then: + // - return false, if at the same time Bit2 exceeds VR2, or + // - return true, otherwise. + // (I.e. "a bit value that does not exist is less than any bit value + // that does exist".) + if (W1 <= Bit1) + return Bit2 < W2; + // If Bit1 is within VR1, but Bit2 is not within VR2, return false. + if (W2 <= Bit2) + return false; + + const BitTracker::BitValue &V1 = RC1[Bit1], V2 = RC2[Bit2]; + if (V1 != V2) + return BitOrd(V1, V2); + return false; +} + + +namespace { + class OrderedRegisterList { + typedef std::vector<unsigned> ListType; + public: + OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {} + void insert(unsigned VR); + void remove(unsigned VR); + unsigned operator[](unsigned Idx) const { + assert(Idx < Seq.size()); + return Seq[Idx]; + } + unsigned size() const { + return Seq.size(); + } + + typedef ListType::iterator iterator; + typedef ListType::const_iterator const_iterator; + iterator begin() { return Seq.begin(); } + iterator end() { return Seq.end(); } + const_iterator begin() const { return Seq.begin(); } + const_iterator end() const { return Seq.end(); } + + // Convenience function to convert an iterator to the corresponding index. + unsigned idx(iterator It) const { return It-begin(); } + private: + ListType Seq; + const RegisterOrdering &Ord; + }; + + + struct PrintORL { + PrintORL(const OrderedRegisterList &L, const TargetRegisterInfo *RI) + : RL(L), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P); + private: + const OrderedRegisterList &RL; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P) { + OS << '('; + OrderedRegisterList::const_iterator B = P.RL.begin(), E = P.RL.end(); + for (OrderedRegisterList::const_iterator I = B; I != E; ++I) { + if (I != B) + OS << ", "; + OS << PrintReg(*I, P.TRI); + } + OS << ')'; + return OS; + } +} + + +void OrderedRegisterList::insert(unsigned VR) { + iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + if (L == Seq.end()) + Seq.push_back(VR); + else + Seq.insert(L, VR); +} + + +void OrderedRegisterList::remove(unsigned VR) { + iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + assert(L != Seq.end()); + Seq.erase(L); +} + + +namespace { + // A record of the insert form. The fields correspond to the operands + // of the "insert" instruction: + // ... 
= insert(SrcR, InsR, #Wdh, #Off) + struct IFRecord { + IFRecord(unsigned SR = 0, unsigned IR = 0, uint16_t W = 0, uint16_t O = 0) + : SrcR(SR), InsR(IR), Wdh(W), Off(O) {} + unsigned SrcR, InsR; + uint16_t Wdh, Off; + }; + + struct PrintIFR { + PrintIFR(const IFRecord &R, const TargetRegisterInfo *RI) + : IFR(R), TRI(RI) {} + private: + const IFRecord &IFR; + const TargetRegisterInfo *TRI; + friend raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P); + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P) { + unsigned SrcR = P.IFR.SrcR, InsR = P.IFR.InsR; + OS << '(' << PrintReg(SrcR, P.TRI) << ',' << PrintReg(InsR, P.TRI) + << ",#" << P.IFR.Wdh << ",#" << P.IFR.Off << ')'; + return OS; + } + + typedef std::pair<IFRecord,RegisterSet> IFRecordWithRegSet; +} + + +namespace llvm { + void initializeHexagonGenInsertPass(PassRegistry&); + FunctionPass *createHexagonGenInsert(); +} + + +namespace { + class HexagonGenInsert : public MachineFunctionPass { + public: + static char ID; + HexagonGenInsert() : MachineFunctionPass(ID), HII(0), HRI(0) { + initializeHexagonGenInsertPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon generate \"insert\" instructions"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + typedef DenseMap<std::pair<unsigned,unsigned>,unsigned> PairMapType; + + void buildOrderingMF(RegisterOrdering &RO) const; + void buildOrderingBT(RegisterOrdering &RB, RegisterOrdering &RO) const; + bool isIntClass(const TargetRegisterClass *RC) const; + bool isConstant(unsigned VR) const; + bool isSmallConstant(unsigned VR) const; + bool isValidInsertForm(unsigned DstR, unsigned SrcR, unsigned InsR, + uint16_t L, uint16_t S) const; + bool findSelfReference(unsigned VR) const; + bool findNonSelfReference(unsigned VR) const; + void getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const; + void getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const; + unsigned distance(const MachineBasicBlock *FromB, + const MachineBasicBlock *ToB, const UnsignedMap &RPO, + PairMapType &M) const; + unsigned distance(MachineBasicBlock::const_iterator FromI, + MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO, + PairMapType &M) const; + bool findRecordInsertForms(unsigned VR, OrderedRegisterList &AVs); + void collectInBlock(MachineBasicBlock *B, OrderedRegisterList &AVs); + void findRemovableRegisters(unsigned VR, IFRecord IF, + RegisterSet &RMs) const; + void computeRemovableRegisters(); + + void pruneEmptyLists(); + void pruneCoveredSets(unsigned VR); + void pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, PairMapType &M); + void pruneRegCopies(unsigned VR); + void pruneCandidates(); + void selectCandidates(); + bool generateInserts(); + + bool removeDeadCode(MachineDomTreeNode *N); + + // IFRecord coupled with a set of potentially removable registers: + typedef std::vector<IFRecordWithRegSet> IFListType; + typedef DenseMap<unsigned,IFListType> IFMapType; // vreg -> IFListType + + void dump_map() const; + + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + + MachineFunction *MFN; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + CellMapShadow *CMS; + + RegisterOrdering BaseOrd; + RegisterOrdering CellOrd; + IFMapType IFMap; + }; + + char HexagonGenInsert::ID = 0; +} + 
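+// An illustrative sketch (assumed semantics, expressed as plain C++; not
+// part of this commit) of what a single 32-bit "insert" computes:
+//
+//   // rd = insert(rs, rt, #W, #O): take rs, and replace its bits
+//   // [O .. O+W) with the W least-significant bits of rt.
+//   uint32_t insert32(uint32_t rs, uint32_t rt, unsigned W, unsigned O) {
+//     uint32_t M = (W == 32 ? ~0u : (1u << W) - 1) << O;
+//     return (rs & ~M) | ((rt << O) & M);
+//   }
+//
+// Thus IFRecord(SrcR, InsR, 9, 8) records that the defined value equals
+// SrcR with bits 8..16 replaced by the low 9 bits of InsR, which is also
+// why Wdh==32 with Off==0 amounts to a plain copy of InsR (see
+// pruneRegCopies below).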
+
+void HexagonGenInsert::dump_map() const {
+  typedef IFMapType::const_iterator iterator;
+  for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    dbgs() << "  " << PrintReg(I->first, HRI) << ":\n";
+    const IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      dbgs() << "    " << PrintIFR(LL[i].first, HRI) << ", "
+             << PrintRegSet(LL[i].second, HRI) << '\n';
+  }
+}
+
+
+void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
+  unsigned Index = 0;
+  typedef MachineFunction::const_iterator mf_iterator;
+  for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) {
+    const MachineBasicBlock &B = *A;
+    if (!CMS->BT.reached(&B))
+      continue;
+    typedef MachineBasicBlock::const_iterator mb_iterator;
+    for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) {
+      const MachineInstr *MI = &*I;
+      for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isDef()) {
+          unsigned R = MO.getReg();
+          assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
+          if (TargetRegisterInfo::isVirtualRegister(R))
+            RO.insert(std::make_pair(R, Index++));
+        }
+      }
+    }
+  }
+  // Since some virtual registers may have had their def and uses eliminated,
+  // they are no longer referenced in the code, and so they will not appear
+  // in the map.
+}
+
+
+void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
+      RegisterOrdering &RO) const {
+  // Create a vector of all virtual registers (collect them from the base
+  // ordering RB), and then sort it using the RegisterCell comparator.
+  BitValueOrdering BVO(RB);
+  RegisterCellLexCompare LexCmp(BVO, *CMS);
+  typedef std::vector<unsigned> SortableVectorType;
+  SortableVectorType VRs;
+  for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
+    VRs.push_back(I->first);
+  std::sort(VRs.begin(), VRs.end(), LexCmp);
+  // Transfer the results to the outgoing register ordering.
+  for (unsigned i = 0, n = VRs.size(); i < n; ++i)
+    RO.insert(std::make_pair(VRs[i], i));
+}
+
+
+inline bool HexagonGenInsert::isIntClass(const TargetRegisterClass *RC) const {
+  return RC == &Hexagon::IntRegsRegClass || RC == &Hexagon::DoubleRegsRegClass;
+}
+
+
+bool HexagonGenInsert::isConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(0) || BV.is(1))
+      continue;
+    return false;
+  }
+  return true;
+}
+
+
+bool HexagonGenInsert::isSmallConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  if (W > 64)
+    return false;
+  uint64_t V = 0, B = 1;
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(1))
+      V |= B;
+    else if (!BV.is(0))
+      return false;
+    B <<= 1;
+  }
+
+  // For 32-bit registers, consider: Rd = #s16.
+  if (W == 32)
+    return isInt<16>(V);
+
+  // For 64-bit registers, it's Rdd = #s8 or Rdd = combine(#s8,#s8)
+  return isInt<8>(Lo_32(V)) && isInt<8>(Hi_32(V));
+}
+
+
+bool HexagonGenInsert::isValidInsertForm(unsigned DstR, unsigned SrcR,
+      unsigned InsR, uint16_t L, uint16_t S) const {
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstR);
+  const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcR);
+  const TargetRegisterClass *InsRC = MRI->getRegClass(InsR);
+  // Only integer (32-/64-bit) register classes.
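+  // E.g. all three registers in IntRegs (or all three in DoubleRegs) is
+  // fine; a DoubleRegs destination with an IntRegs InsR is rejected below;
+  // an IntRegs destination with a DoubleRegs InsR is allowed only when the
+  // inserted field does not straddle the 32-bit word boundary.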
+ if (!isIntClass(DstRC) || !isIntClass(SrcRC) || !isIntClass(InsRC)) + return false; + // The "source" register must be of the same class as DstR. + if (DstRC != SrcRC) + return false; + if (DstRC == InsRC) + return true; + // A 64-bit register can only be generated from other 64-bit registers. + if (DstRC == &Hexagon::DoubleRegsRegClass) + return false; + // Otherwise, the L and S cannot span 32-bit word boundary. + if (S < 32 && S+L > 32) + return false; + return true; +} + + +bool HexagonGenInsert::findSelfReference(unsigned VR) const { + const BitTracker::RegisterCell &RC = CMS->lookup(VR); + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == VR) + return true; + } + return false; +} + + +bool HexagonGenInsert::findNonSelfReference(unsigned VR) const { + BitTracker::RegisterCell RC = CMS->lookup(VR); + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != VR) + return true; + } + return false; +} + + +void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, + RegisterSet &Defs) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Defs.insert(R); + } +} + + +void HexagonGenInsert::getInstrUses(const MachineInstr *MI, + RegisterSet &Uses) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned R = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Uses.insert(R); + } +} + + +unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB, + const MachineBasicBlock *ToB, const UnsignedMap &RPO, + PairMapType &M) const { + // Forward distance from the end of a block to the beginning of it does + // not make sense. This function should not be called with FromB == ToB. + assert(FromB != ToB); + + unsigned FromN = FromB->getNumber(), ToN = ToB->getNumber(); + // If we have already computed it, return the cached result. + PairMapType::iterator F = M.find(std::make_pair(FromN, ToN)); + if (F != M.end()) + return F->second; + unsigned ToRPO = RPO.lookup(ToN); + + unsigned MaxD = 0; + typedef MachineBasicBlock::const_pred_iterator pred_iterator; + for (pred_iterator I = ToB->pred_begin(), E = ToB->pred_end(); I != E; ++I) { + const MachineBasicBlock *PB = *I; + // Skip back edges. Also, if FromB is a predecessor of ToB, the distance + // along that path will be 0, and we don't need to do any calculations + // on it. + if (PB == FromB || RPO.lookup(PB->getNumber()) >= ToRPO) + continue; + unsigned D = PB->size() + distance(FromB, PB, RPO, M); + if (D > MaxD) + MaxD = D; + } + + // Memoize the result for later lookup. 
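+  // E.g. (illustrative) for a diamond B1->B2->B4, B1->B3->B4, the result
+  // for (B1,B4) is max(|B2|, |B3|): each predecessor contributes its own
+  // size plus the (memoized) distance from B1 to it, and back edges are
+  // ignored via the RPO comparison above.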
+ M.insert(std::make_pair(std::make_pair(FromN, ToN), MaxD)); + return MaxD; +} + + +unsigned HexagonGenInsert::distance(MachineBasicBlock::const_iterator FromI, + MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO, + PairMapType &M) const { + const MachineBasicBlock *FB = FromI->getParent(), *TB = ToI->getParent(); + if (FB == TB) + return std::distance(FromI, ToI); + unsigned D1 = std::distance(TB->begin(), ToI); + unsigned D2 = distance(FB, TB, RPO, M); + unsigned D3 = std::distance(FromI, FB->end()); + return D1+D2+D3; +} + + +bool HexagonGenInsert::findRecordInsertForms(unsigned VR, + OrderedRegisterList &AVs) { + if (isDebug()) { + dbgs() << LLVM_FUNCTION_NAME << ": " << PrintReg(VR, HRI) + << " AVs: " << PrintORL(AVs, HRI) << "\n"; + } + if (AVs.size() == 0) + return false; + + typedef OrderedRegisterList::iterator iterator; + BitValueOrdering BVO(BaseOrd); + const BitTracker::RegisterCell &RC = CMS->lookup(VR); + uint16_t W = RC.width(); + + typedef std::pair<unsigned,uint16_t> RSRecord; // (reg,shift) + typedef std::vector<RSRecord> RSListType; + // Have a map, with key being the matching prefix length, and the value + // being the list of pairs (R,S), where R's prefix matches VR at S. + // (DenseMap<uint16_t,RSListType> fails to instantiate.) + typedef DenseMap<unsigned,RSListType> LRSMapType; + LRSMapType LM; + + // Conceptually, rotate the cell RC right (i.e. towards the LSB) by S, + // and find matching prefixes from AVs with the rotated RC. Such a prefix + // would match a string of bits (of length L) in RC starting at S. + for (uint16_t S = 0; S < W; ++S) { + iterator B = AVs.begin(), E = AVs.end(); + // The registers in AVs are ordered according to the lexical order of + // the corresponding register cells. This means that the range of regis- + // ters in AVs that match a prefix of length L+1 will be contained in + // the range that matches a prefix of length L. This means that we can + // keep narrowing the search space as the prefix length goes up. This + // helps reduce the overall complexity of the search. + uint16_t L; + for (L = 0; L < W-S; ++L) { + // Compare against VR's bits starting at S, which emulates rotation + // of VR by S. + RegisterCellBitCompareSel RCB(VR, S+L, L, BVO, *CMS); + iterator NewB = std::lower_bound(B, E, VR, RCB); + iterator NewE = std::upper_bound(NewB, E, VR, RCB); + // For the registers that are eliminated from the next range, L is + // the longest prefix matching VR at position S (their prefixes + // differ from VR at S+L). If L>0, record this information for later + // use. + if (L > 0) { + for (iterator I = B; I != NewB; ++I) + LM[L].push_back(std::make_pair(*I, S)); + for (iterator I = NewE; I != E; ++I) + LM[L].push_back(std::make_pair(*I, S)); + } + B = NewB, E = NewE; + if (B == E) + break; + } + // Record the final register range. If this range is non-empty, then + // L=W-S. + assert(B == E || L == W-S); + if (B != E) { + for (iterator I = B; I != E; ++I) + LM[L].push_back(std::make_pair(*I, S)); + // If B!=E, then we found a range of registers whose prefixes cover the + // rest of VR from position S. There is no need to further advance S. 
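+      // E.g. (illustrative) for a 32-bit VR and S=24: every register left
+      // in [B,E) has a cell whose low 8 bits equal bits 24..31 of VR's
+      // cell, and is recorded under L=8 with S=24.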
+ break; + } + } + + if (isDebug()) { + dbgs() << "Prefixes matching register " << PrintReg(VR, HRI) << "\n"; + for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) { + dbgs() << " L=" << I->first << ':'; + const RSListType &LL = I->second; + for (unsigned i = 0, n = LL.size(); i < n; ++i) + dbgs() << " (" << PrintReg(LL[i].first, HRI) << ",@" + << LL[i].second << ')'; + dbgs() << '\n'; + } + } + + + bool Recorded = false; + + for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) { + unsigned SrcR = *I; + int FDi = -1, LDi = -1; // First/last different bit. + const BitTracker::RegisterCell &AC = CMS->lookup(SrcR); + uint16_t AW = AC.width(); + for (uint16_t i = 0, w = std::min(W, AW); i < w; ++i) { + if (RC[i] == AC[i]) + continue; + if (FDi == -1) + FDi = i; + LDi = i; + } + if (FDi == -1) + continue; // TODO (future): Record identical registers. + // Look for a register whose prefix could patch the range [FD..LD] + // where VR and SrcR differ. + uint16_t FD = FDi, LD = LDi; // Switch to unsigned type. + uint16_t MinL = LD-FD+1; + for (uint16_t L = MinL; L < W; ++L) { + LRSMapType::iterator F = LM.find(L); + if (F == LM.end()) + continue; + RSListType &LL = F->second; + for (unsigned i = 0, n = LL.size(); i < n; ++i) { + uint16_t S = LL[i].second; + // MinL is the minimum length of the prefix. Any length above MinL + // allows some flexibility as to where the prefix can start: + // given the extra length EL=L-MinL, the prefix must start between + // max(0,FD-EL) and FD. + if (S > FD) // Starts too late. + continue; + uint16_t EL = L-MinL; + uint16_t LowS = (EL < FD) ? FD-EL : 0; + if (S < LowS) // Starts too early. + continue; + unsigned InsR = LL[i].first; + if (!isValidInsertForm(VR, SrcR, InsR, L, S)) + continue; + if (isDebug()) { + dbgs() << PrintReg(VR, HRI) << " = insert(" << PrintReg(SrcR, HRI) + << ',' << PrintReg(InsR, HRI) << ",#" << L << ",#" + << S << ")\n"; + } + IFRecordWithRegSet RR(IFRecord(SrcR, InsR, L, S), RegisterSet()); + IFMap[VR].push_back(RR); + Recorded = true; + } + } + } + + return Recorded; +} + + +void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, + OrderedRegisterList &AVs) { + if (isDebug()) + dbgs() << "visiting block BB#" << B->getNumber() << "\n"; + + // First, check if this block is reachable at all. If not, the bit tracker + // will not have any information about registers in it. + if (!CMS->BT.reached(B)) + return; + + bool DoConst = OptConst; + // Keep a separate set of registers defined in this block, so that we + // can remove them from the list of available registers once all DT + // successors have been processed. + RegisterSet BlockDefs, InsDefs; + for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + MachineInstr *MI = &*I; + InsDefs.clear(); + getInstrDefs(MI, InsDefs); + // Leave those alone. They are more transparent than "insert". + bool Skip = MI->isCopy() || MI->isRegSequence(); + + if (!Skip) { + // Visit all defined registers, and attempt to find the corresponding + // "insert" representations. + for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR)) { + // Do not collect registers that are known to be compile-time cons- + // tants, unless requested. + if (!DoConst && isConstant(VR)) + continue; + // If VR's cell contains a reference to VR, then VR cannot be defined + // via "insert". If VR is a constant that can be generated in a single + // instruction (without constant extenders), generating it via insert + // makes no sense. 
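+        // E.g. a 32-bit cell evaluating to the constant 200 fits a single
+        // "Rd = #s16" transfer-immediate, so isSmallConstant() filters it
+        // out even when -insert-const asks for constants to be collected.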
+        if (findSelfReference(VR) || isSmallConstant(VR))
+          continue;
+
+        findRecordInsertForms(VR, AVs);
+      }
+    }
+
+    // Insert the defined registers into the list of available registers
+    // after they have been processed.
+    for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR))
+      AVs.insert(VR);
+    BlockDefs.insert(InsDefs);
+  }
+
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType ChildIter;
+  for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    collectInBlock(SB, AVs);
+  }
+
+  for (unsigned VR = BlockDefs.find_first(); VR; VR = BlockDefs.find_next(VR))
+    AVs.remove(VR);
+}
+
+
+void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
+      RegisterSet &RMs) const {
+  // For a given register VR and an insert form, find the registers that are
+  // used by the current definition of VR, and which would no longer be
+  // needed for it after the definition of VR is replaced with the insert
+  // form. These are the registers that could potentially become dead.
+  RegisterSet Regs[2];
+
+  unsigned S = 0;  // Register set selector.
+  Regs[S].insert(VR);
+
+  while (!Regs[S].empty()) {
+    // Breadth-first search.
+    unsigned OtherS = 1-S;
+    Regs[OtherS].clear();
+    for (unsigned R = Regs[S].find_first(); R; R = Regs[S].find_next(R)) {
+      Regs[S].remove(R);
+      if (R == IF.SrcR || R == IF.InsR)
+        continue;
+      // Check if a given register has bits that are references to any other
+      // registers. This is to detect situations where the instruction that
+      // defines register R takes register Q as an operand, but R itself does
+      // not contain any bits from Q. Loads are examples of how this could
+      // happen:
+      //   R = load Q
+      // In this case (assuming we do not have any knowledge about the loaded
+      // value), we must not treat R as a "conveyance" of the bits from Q.
+      // (The information in BT about R's bits would have them as constants,
+      // in case of zero-extending loads, or refs to R.)
+      if (!findNonSelfReference(R))
+        continue;
+      RMs.insert(R);
+      const MachineInstr *DefI = MRI->getVRegDef(R);
+      assert(DefI);
+      // Do not iterate past PHI nodes to avoid infinite loops. This can
+      // make the final set a bit less accurate, but the removable register
+      // sets are an approximation anyway.
+      if (DefI->isPHI())
+        continue;
+      getInstrUses(DefI, Regs[OtherS]);
+    }
+    S = OtherS;
+  }
+  // The register VR is added to the list as a side-effect of the algorithm,
+  // but it is not "potentially removable". A potentially removable register
+  // is one that may become unused (dead) after conversion to the insert form
+  // IF, and obviously VR (or its replacement) will not become dead by apply-
+  // ing IF.
+  RMs.remove(VR);
+}
+
+
+void HexagonGenInsert::computeRemovableRegisters() {
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      findRemovableRegisters(I->first, LL[i].first, LL[i].second);
+  }
+}
+
+
+void HexagonGenInsert::pruneEmptyLists() {
+  // Remove all entries from the map where the register has no insert forms
+  // associated with it.
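+  // The matching iterators are collected first and erased afterwards, so
+  // that the map is not modified while it is being traversed.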
+ typedef SmallVector<IFMapType::iterator,16> IterListType; + IterListType Prune; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + if (I->second.size() == 0) + Prune.push_back(I); + } + for (unsigned i = 0, n = Prune.size(); i < n; ++i) + IFMap.erase(Prune[i]); +} + + +void HexagonGenInsert::pruneCoveredSets(unsigned VR) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + + // First, examine the IF candidates for register VR whose removable-regis- + // ter sets are empty. This means that a given candidate will not help eli- + // minate any registers, but since "insert" is not a constant-extendable + // instruction, using such a candidate may reduce code size if the defini- + // tion of VR is constant-extended. + // If there exists a candidate with a non-empty set, the ones with empty + // sets will not be used and can be removed. + MachineInstr *DefVR = MRI->getVRegDef(VR); + bool DefEx = HII->isConstExtended(DefVR); + bool HasNE = false; + for (unsigned i = 0, n = LL.size(); i < n; ++i) { + if (LL[i].second.empty()) + continue; + HasNE = true; + break; + } + if (!DefEx || HasNE) { + // The definition of VR is not constant-extended, or there is a candidate + // with a non-empty set. Remove all candidates with empty sets. + auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool { + return IR.second.empty(); + }; + auto End = std::remove_if(LL.begin(), LL.end(), IsEmpty); + if (End != LL.end()) + LL.erase(End, LL.end()); + } else { + // The definition of VR is constant-extended, and all candidates have + // empty removable-register sets. Pick the maximum candidate, and remove + // all others. The "maximum" does not have any special meaning here, it + // is only so that the candidate that will remain on the list is selec- + // ted deterministically. + IFRecord MaxIF = LL[0].first; + for (unsigned i = 1, n = LL.size(); i < n; ++i) { + // If LL[MaxI] < LL[i], then MaxI = i. + const IFRecord &IF = LL[i].first; + unsigned M0 = BaseOrd[MaxIF.SrcR], M1 = BaseOrd[MaxIF.InsR]; + unsigned R0 = BaseOrd[IF.SrcR], R1 = BaseOrd[IF.InsR]; + if (M0 > R0) + continue; + if (M0 == R0) { + if (M1 > R1) + continue; + if (M1 == R1) { + if (MaxIF.Wdh > IF.Wdh) + continue; + if (MaxIF.Wdh == IF.Wdh && MaxIF.Off >= IF.Off) + continue; + } + } + // MaxIF < IF. + MaxIF = IF; + } + // Remove everything except the maximum candidate. All register sets + // are empty, so no need to preserve anything. + LL.clear(); + LL.push_back(std::make_pair(MaxIF, RegisterSet())); + } + + // Now, remove those whose sets of potentially removable registers are + // contained in another IF candidate for VR. For example, given these + // candidates for vreg45, + // %vreg45: + // (%vreg44,%vreg41,#9,#8), { %vreg42 } + // (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 } + // remove the first one, since it is contained in the second one. + for (unsigned i = 0, n = LL.size(); i < n; ) { + const RegisterSet &RMi = LL[i].second; + unsigned j = 0; + while (j < n) { + if (j != i && LL[j].second.includes(RMi)) + break; + j++; + } + if (j == n) { // RMi not contained in anything else. 
+ i++; + continue; + } + LL.erase(LL.begin()+i); + n = LL.size(); + } +} + + +void HexagonGenInsert::pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, + PairMapType &M) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + unsigned Cutoff = VRegDistCutoff; + const MachineInstr *DefV = MRI->getVRegDef(VR); + + for (unsigned i = LL.size(); i > 0; --i) { + unsigned SR = LL[i-1].first.SrcR, IR = LL[i-1].first.InsR; + const MachineInstr *DefS = MRI->getVRegDef(SR); + const MachineInstr *DefI = MRI->getVRegDef(IR); + unsigned DSV = distance(DefS, DefV, RPO, M); + if (DSV < Cutoff) { + unsigned DIV = distance(DefI, DefV, RPO, M); + if (DIV < Cutoff) + continue; + } + LL.erase(LL.begin()+(i-1)); + } +} + + +void HexagonGenInsert::pruneRegCopies(unsigned VR) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + + auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool { + return IR.first.Wdh == 32 && (IR.first.Off == 0 || IR.first.Off == 32); + }; + auto End = std::remove_if(LL.begin(), LL.end(), IsCopy); + if (End != LL.end()) + LL.erase(End, LL.end()); +} + + +void HexagonGenInsert::pruneCandidates() { + // Remove candidates that are not beneficial, regardless of the final + // selection method. + // First, remove candidates whose potentially removable set is a subset + // of another candidate's set. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneCoveredSets(I->first); + + UnsignedMap RPO; + typedef ReversePostOrderTraversal<const MachineFunction*> RPOTType; + RPOTType RPOT(MFN); + unsigned RPON = 0; + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) + RPO[(*I)->getNumber()] = RPON++; + + PairMapType Memo; // Memoization map for distance calculation. + // Remove candidates that would use registers defined too far away. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneUsesTooFar(I->first, RPO, Memo); + + pruneEmptyLists(); + + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneRegCopies(I->first); +} + + +namespace { + // Class for comparing IF candidates for registers that have multiple of + // them. The smaller the candidate, according to this ordering, the better. + // First, compare the number of zeros in the associated potentially remova- + // ble register sets. "Zero" indicates that the register is very likely to + // become dead after this transformation. + // Second, compare "averages", i.e. use-count per size. The lower wins. + // After that, it does not really matter which one is smaller. Resolve + // the tie in some deterministic way. + struct IFOrdering { + IFOrdering(const UnsignedMap &UC, const RegisterOrdering &BO) + : UseC(UC), BaseOrd(BO) {} + bool operator() (const IFRecordWithRegSet &A, + const IFRecordWithRegSet &B) const; + private: + void stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero, + unsigned &Sum) const; + const UnsignedMap &UseC; + const RegisterOrdering &BaseOrd; + }; +} + + +bool IFOrdering::operator() (const IFRecordWithRegSet &A, + const IFRecordWithRegSet &B) const { + unsigned SizeA = 0, ZeroA = 0, SumA = 0; + unsigned SizeB = 0, ZeroB = 0, SumB = 0; + stats(A.second, SizeA, ZeroA, SumA); + stats(B.second, SizeB, ZeroB, SumB); + + // We will pick the minimum element. The more zeros, the better. + if (ZeroA != ZeroB) + return ZeroA > ZeroB; + // Compare SumA/SizeA with SumB/SizeB, lower is better. 
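+  // The quotients are compared by cross-multiplying: for non-zero sizes,
+  // SumA/SizeA < SumB/SizeB  <=>  SumA*SizeB < SumB*SizeA, which avoids
+  // integer division and keeps the comparison exact.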
+ uint64_t AvgA = SumA*SizeB, AvgB = SumB*SizeA; + if (AvgA != AvgB) + return AvgA < AvgB; + + // The sets compare identical so far. Resort to comparing the IF records. + // The actual values don't matter, this is only for determinism. + unsigned OSA = BaseOrd[A.first.SrcR], OSB = BaseOrd[B.first.SrcR]; + if (OSA != OSB) + return OSA < OSB; + unsigned OIA = BaseOrd[A.first.InsR], OIB = BaseOrd[B.first.InsR]; + if (OIA != OIB) + return OIA < OIB; + if (A.first.Wdh != B.first.Wdh) + return A.first.Wdh < B.first.Wdh; + return A.first.Off < B.first.Off; +} + + +void IFOrdering::stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero, + unsigned &Sum) const { + for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) { + UnsignedMap::const_iterator F = UseC.find(R); + assert(F != UseC.end()); + unsigned UC = F->second; + if (UC == 0) + Zero++; + Sum += UC; + Size++; + } +} + + +void HexagonGenInsert::selectCandidates() { + // Some registers may have multiple valid candidates. Pick the best one + // (or decide not to use any). + + // Compute the "removability" measure of R: + // For each potentially removable register R, record the number of regis- + // ters with IF candidates, where R appears in at least one set. + RegisterSet AllRMs; + UnsignedMap UseC, RemC; + IFMapType::iterator End = IFMap.end(); + + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + const IFListType &LL = I->second; + RegisterSet TT; + for (unsigned i = 0, n = LL.size(); i < n; ++i) + TT.insert(LL[i].second); + for (unsigned R = TT.find_first(); R; R = TT.find_next(R)) + RemC[R]++; + AllRMs.insert(TT); + } + + for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) { + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + typedef SmallSet<const MachineInstr*,16> InstrSet; + InstrSet UIs; + // Count as the number of instructions in which R is used, not the + // number of operands. + use_iterator E = MRI->use_nodbg_end(); + for (use_iterator I = MRI->use_nodbg_begin(R); I != E; ++I) + UIs.insert(I->getParent()); + unsigned C = UIs.size(); + // Calculate a measure, which is the number of instructions using R, + // minus the "removability" count computed earlier. + unsigned D = RemC[R]; + UseC[R] = (C > D) ? C-D : 0; // doz + } + + + bool SelectAll0 = OptSelectAll0, SelectHas0 = OptSelectHas0; + if (!SelectAll0 && !SelectHas0) + SelectAll0 = true; + + // The smaller the number UseC for a given register R, the "less used" + // R is aside from the opportunities for removal offered by generating + // "insert" instructions. + // Iterate over the IF map, and for those registers that have multiple + // candidates, pick the minimum one according to IFOrdering. + IFOrdering IFO(UseC, BaseOrd); + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + IFListType &LL = I->second; + if (LL.empty()) + continue; + // Get the minimum element, remember it and clear the list. If the + // element found is adequate, we will put it back on the list, other- + // wise the list will remain empty, and the entry for this register + // will be removed (i.e. this register will not be replaced by insert). + IFListType::iterator MinI = std::min_element(LL.begin(), LL.end(), IFO); + assert(MinI != LL.end()); + IFRecordWithRegSet M = *MinI; + LL.clear(); + + // We want to make sure that this replacement will have a chance to be + // beneficial, and that means that we want to have indication that some + // register will be removed. 
The most likely registers to be eliminated + // are the use operands in the definition of I->first. Accept/reject a + // candidate based on how many of its uses it can potentially eliminate. + + RegisterSet Us; + const MachineInstr *DefI = MRI->getVRegDef(I->first); + getInstrUses(DefI, Us); + bool Accept = false; + + if (SelectAll0) { + bool All0 = true; + for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) { + if (UseC[R] == 0) + continue; + All0 = false; + break; + } + Accept = All0; + } else if (SelectHas0) { + bool Has0 = false; + for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) { + if (UseC[R] != 0) + continue; + Has0 = true; + break; + } + Accept = Has0; + } + if (Accept) + LL.push_back(M); + } + + // Remove candidates that add uses of removable registers, unless the + // removable registers are among replacement candidates. + // Recompute the removable registers, since some candidates may have + // been eliminated. + AllRMs.clear(); + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + const IFListType &LL = I->second; + if (LL.size() > 0) + AllRMs.insert(LL[0].second); + } + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + IFListType &LL = I->second; + if (LL.size() == 0) + continue; + unsigned SR = LL[0].first.SrcR, IR = LL[0].first.InsR; + if (AllRMs[SR] || AllRMs[IR]) + LL.clear(); + } + + pruneEmptyLists(); +} + + +bool HexagonGenInsert::generateInserts() { + // Create a new register for each one from IFMap, and store them in the + // map. + UnsignedMap RegMap; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + unsigned VR = I->first; + const TargetRegisterClass *RC = MRI->getRegClass(VR); + unsigned NewVR = MRI->createVirtualRegister(RC); + RegMap[VR] = NewVR; + } + + // We can generate the "insert" instructions using potentially stale re- + // gisters: SrcR and InsR for a given VR may be among other registers that + // are also replaced. This is fine, we will do the mass "rauw" a bit later. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + MachineInstr *MI = MRI->getVRegDef(I->first); + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned NewR = RegMap[I->first]; + bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass; + const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert) + : HII->get(Hexagon::S2_insertp); + IFRecord IF = I->second[0].first; + unsigned Wdh = IF.Wdh, Off = IF.Off; + unsigned InsS = 0; + if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) { + InsS = Hexagon::subreg_loreg; + if (Off >= 32) { + InsS = Hexagon::subreg_hireg; + Off -= 32; + } + } + // Advance to the proper location for inserting instructions. This could + // be B.end(). 
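// Illustrative aside, not part of this patch: the S2_insert/S2_insertp
// built below performs a bitfield deposit -- the low Wdh bits of InsR
// replace bits [Off, Off+Wdh) of SrcR. A minimal C++ model of the 32-bit
// form (helper name invented), assuming 0 < Wdh and Off + Wdh <= 32:
//
//   uint32_t insert32(uint32_t SrcR, uint32_t InsR,
//                     unsigned Wdh, unsigned Off) {
//     uint32_t Mask = (Wdh == 32) ? ~0u : ((1u << Wdh) - 1);
//     return (SrcR & ~(Mask << Off)) | ((InsR & Mask) << Off);
//   }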
+ MachineBasicBlock::iterator At = MI; + if (MI->isPHI()) + At = B.getFirstNonPHI(); + + BuildMI(B, At, DL, D, NewR) + .addReg(IF.SrcR) + .addReg(IF.InsR, 0, InsS) + .addImm(Wdh) + .addImm(Off); + + MRI->clearKillFlags(IF.SrcR); + MRI->clearKillFlags(IF.InsR); + } + + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + MachineInstr *DefI = MRI->getVRegDef(I->first); + MRI->replaceRegWith(I->first, RegMap[I->first]); + DefI->eraseFromParent(); + } + + return true; +} + + +bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { + bool Changed = false; + typedef GraphTraits<MachineDomTreeNode*> GTN; + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + Changed |= removeDeadCode(*I); + + MachineBasicBlock *B = N->getBlock(); + std::vector<MachineInstr*> Instrs; + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) + Instrs.push_back(&*I); + + for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) { + MachineInstr *MI = *I; + unsigned Opc = MI->getOpcode(); + // Do not touch lifetime markers. This is why the target-independent DCE + // cannot be used. + if (Opc == TargetOpcode::LIFETIME_START || + Opc == TargetOpcode::LIFETIME_END) + continue; + bool Store = false; + if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store)) + continue; + + bool AllDead = true; + SmallVector<unsigned,2> Regs; + for (ConstMIOperands Op(MI); Op.isValid(); ++Op) { + if (!Op->isReg() || !Op->isDef()) + continue; + unsigned R = Op->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R) || + !MRI->use_nodbg_empty(R)) { + AllDead = false; + break; + } + Regs.push_back(R); + } + if (!AllDead) + continue; + + B->erase(MI); + for (unsigned I = 0, N = Regs.size(); I != N; ++I) + MRI->markUsesInDebugValueAsUndef(Regs[I]); + Changed = true; + } + + return Changed; +} + + +bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { + bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail; + bool Changed = false; + TimerGroup __G("hexinsert"); + NamedRegionTimer __T("hexinsert", Timing && !TimingDetail); + + // Sanity check: one, but not both. + assert(!OptSelectAll0 || !OptSelectHas0); + + IFMap.clear(); + BaseOrd.clear(); + CellOrd.clear(); + + const auto &ST = MF.getSubtarget<HexagonSubtarget>(); + HII = ST.getInstrInfo(); + HRI = ST.getRegisterInfo(); + MFN = &MF; + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + + // Clean up before any further processing, so that dead code does not + // get used in a newly generated "insert" instruction. Have a custom + // version of DCE that preserves lifetime markers. Without it, merging + // of stack objects can fail to recognize and merge disjoint objects + // leading to unnecessary stack growth. + Changed |= removeDeadCode(MDT->getRootNode()); + + const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); + BitTracker BTLoc(HE, MF); + BTLoc.trace(isDebug()); + BTLoc.run(); + CellMapShadow MS(BTLoc); + CMS = &MS; + + buildOrderingMF(BaseOrd); + buildOrderingBT(BaseOrd, CellOrd); + + if (isDebug()) { + dbgs() << "Cell ordering:\n"; + for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end(); + I != E; ++I) { + unsigned VR = I->first, Pos = I->second; + dbgs() << PrintReg(VR, HRI) << " -> " << Pos << "\n"; + } + } + + // Collect candidates for conversion into the insert forms. 
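// Overview aside: the driver that follows runs the pass's phases in
// sequence -- collect candidates per block (collectInBlock), discard ones
// that cannot pay off (pruneCandidates), keep at most one candidate per
// register (selectCandidates), then rewrite the code (generateInserts) --
// with each phase wrapped in a NamedRegionTimer so the optional timing
// output can be attributed per phase.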
+ MachineBasicBlock *RootB = MDT->getRoot(); + OrderedRegisterList AvailR(CellOrd); + + { + NamedRegionTimer _T("collection", "hexinsert", TimingDetail); + collectInBlock(RootB, AvailR); + // Complete the information gathered in IFMap. + computeRemovableRegisters(); + } + + if (isDebug()) { + dbgs() << "Candidates after collection:\n"; + dump_map(); + } + + if (IFMap.empty()) + return false; + + { + NamedRegionTimer _T("pruning", "hexinsert", TimingDetail); + pruneCandidates(); + } + + if (isDebug()) { + dbgs() << "Candidates after pruning:\n"; + dump_map(); + } + + if (IFMap.empty()) + return false; + + { + NamedRegionTimer _T("selection", "hexinsert", TimingDetail); + selectCandidates(); + } + + if (isDebug()) { + dbgs() << "Candidates after selection:\n"; + dump_map(); + } + + // Filter out vregs beyond the cutoff. + if (VRegIndexCutoff.getPosition()) { + unsigned Cutoff = VRegIndexCutoff; + typedef SmallVector<IFMapType::iterator,16> IterListType; + IterListType Out; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first); + if (Idx >= Cutoff) + Out.push_back(I); + } + for (unsigned i = 0, n = Out.size(); i < n; ++i) + IFMap.erase(Out[i]); + } + + { + NamedRegionTimer _T("generation", "hexinsert", TimingDetail); + Changed = generateInserts(); + } + + return Changed; +} + + +FunctionPass *llvm::createHexagonGenInsert() { + return new HexagonGenInsert(); +} + + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert", + "Hexagon generate \"insert\" instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert", + "Hexagon generate \"insert\" instructions", false, false) diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp new file mode 100644 index 000000000000..6905c4f6d125 --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -0,0 +1,525 @@ +//===--- HexagonGenPredicate.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "gen-pred" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "HexagonTargetMachine.h" + +#include <functional> +#include <queue> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + void initializeHexagonGenPredicatePass(PassRegistry& Registry); + FunctionPass *createHexagonGenPredicate(); +} + +namespace { + struct Register { + unsigned R, S; + Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {} + Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {} + bool operator== (const Register &Reg) const { + return R == Reg.R && S == Reg.S; + } + bool operator< (const Register &Reg) const { + return R < Reg.R || (R == Reg.R && S < Reg.S); + } + }; + struct PrintRegister { + PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {} + friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR); + private: + Register Reg; + const TargetRegisterInfo &TRI; + }; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) + LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) { + return OS << PrintReg(PR.Reg.R, &PR.TRI, PR.Reg.S); + } + + class HexagonGenPredicate : public MachineFunctionPass { + public: + static char ID; + HexagonGenPredicate() : MachineFunctionPass(ID), TII(0), TRI(0), MRI(0) { + initializeHexagonGenPredicatePass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon generate predicate operations"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + typedef SetVector<MachineInstr*> VectOfInst; + typedef std::set<Register> SetOfReg; + typedef std::map<Register,Register> RegToRegMap; + + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; + MachineRegisterInfo *MRI; + SetOfReg PredGPRs; + VectOfInst PUsers; + RegToRegMap G2P; + + bool isPredReg(unsigned R); + void collectPredicateGPR(MachineFunction &MF); + void processPredicateGPR(const Register &Reg); + unsigned getPredForm(unsigned Opc); + bool isConvertibleToPredForm(const MachineInstr *MI); + bool isScalarCmp(unsigned Opc); + bool isScalarPred(Register PredReg); + Register getPredRegFor(const Register &Reg); + bool convertToPredForm(MachineInstr *MI); + bool eliminatePredCopies(MachineFunction &MF); + }; + + char HexagonGenPredicate::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred", + "Hexagon generate predicate operations", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", + "Hexagon generate predicate operations", false, false) + +bool HexagonGenPredicate::isPredReg(unsigned R) { + if (!TargetRegisterInfo::isVirtualRegister(R)) + return false; + const TargetRegisterClass *RC = 
MRI->getRegClass(R); + return RC == &Hexagon::PredRegsRegClass; +} + + +unsigned HexagonGenPredicate::getPredForm(unsigned Opc) { + using namespace Hexagon; + + switch (Opc) { + case A2_and: + case A2_andp: + return C2_and; + case A4_andn: + case A4_andnp: + return C2_andn; + case M4_and_and: + return C4_and_and; + case M4_and_andn: + return C4_and_andn; + case M4_and_or: + return C4_and_or; + + case A2_or: + case A2_orp: + return C2_or; + case A4_orn: + case A4_ornp: + return C2_orn; + case M4_or_and: + return C4_or_and; + case M4_or_andn: + return C4_or_andn; + case M4_or_or: + return C4_or_or; + + case A2_xor: + case A2_xorp: + return C2_xor; + + case C2_tfrrp: + return COPY; + } + // The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here + // to denote "none", but we need to make sure that none of the valid opcodes + // that we return will ever be 0. + assert(PHI == 0 && "Use different value for <none>"); + return 0; +} + + +bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + if (getPredForm(Opc) != 0) + return true; + + // Comparisons against 0 are also convertible. This does not apply to + // A4_rcmpeqi or A4_rcmpneqi, since they produce values 0 or 1, which + // may not match the value that the predicate register would have if + // it was converted to a predicate form. + switch (Opc) { + case Hexagon::C2_cmpeqi: + case Hexagon::C4_cmpneqi: + if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) + return true; + break; + } + return false; +} + + +void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { + for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { + MachineBasicBlock &B = *A; + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + MachineInstr *MI = &*I; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::C2_tfrpr: + case TargetOpcode::COPY: + if (isPredReg(MI->getOperand(1).getReg())) { + Register RD = MI->getOperand(0); + if (TargetRegisterInfo::isVirtualRegister(RD.R)) + PredGPRs.insert(RD); + } + break; + } + } + } +} + + +void HexagonGenPredicate::processPredicateGPR(const Register &Reg) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " + << PrintReg(Reg.R, TRI, Reg.S) << "\n"); + typedef MachineRegisterInfo::use_iterator use_iterator; + use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end(); + if (I == E) { + DEBUG(dbgs() << "Dead reg: " << PrintReg(Reg.R, TRI, Reg.S) << '\n'); + MachineInstr *DefI = MRI->getVRegDef(Reg.R); + DefI->eraseFromParent(); + return; + } + + for (; I != E; ++I) { + MachineInstr *UseI = I->getParent(); + if (isConvertibleToPredForm(UseI)) + PUsers.insert(UseI); + } +} + + +Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { + // Create a predicate register for a given Reg. The newly created register + // will have its value copied from Reg, so that it can be later used as + // an operand in other instructions. 
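// Schematic example (virtual register names invented): if %vreg7 is
// defined by
//   %vreg7 = C2_tfrpr %vreg3      ; predicate -> GPR transfer
// the function records and returns %vreg3 directly. If %vreg7 instead
// comes from a convertible instruction such as
//   %vreg7 = A2_and %vreg1, %vreg2
// the definition is left untouched (it may itself be converted later) and
// a fresh predicate register is created right after it:
//   %vreg9 = COPY %vreg7
// with %vreg7 -> %vreg9 cached in G2P so the copy is emitted only once.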
+  assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+  RegToRegMap::iterator F = G2P.find(Reg);
+  if (F != G2P.end())
+    return F->second;
+
+  DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << PrintRegister(Reg, *TRI));
+  MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+  assert(DefI);
+  unsigned Opc = DefI->getOpcode();
+  if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) {
+    assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
+    Register PR = DefI->getOperand(1);
+    G2P.insert(std::make_pair(Reg, PR));
+    DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+    return PR;
+  }
+
+  MachineBasicBlock &B = *DefI->getParent();
+  DebugLoc DL = DefI->getDebugLoc();
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  unsigned NewPR = MRI->createVirtualRegister(PredRC);
+
+  // For convertible instructions, do not modify them, so that they can
+  // be converted later. Generate a copy from Reg to NewPR.
+  if (isConvertibleToPredForm(DefI)) {
+    MachineBasicBlock::iterator DefIt = DefI;
+    BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
+      .addReg(Reg.R, 0, Reg.S);
+    G2P.insert(std::make_pair(Reg, Register(NewPR)));
+    DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+    return Register(NewPR);
+  }
+
+  llvm_unreachable("Invalid argument");
+}
+
+
+bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgei:
+    case Hexagon::C2_cmpgeui:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      return true;
+  }
+  return false;
+}
+
+
+bool HexagonGenPredicate::isScalarPred(Register PredReg) {
+  std::queue<Register> WorkQ;
+  WorkQ.push(PredReg);
+
+  while (!WorkQ.empty()) {
+    Register PR = WorkQ.front();
+    WorkQ.pop();
+    const MachineInstr *DefI = MRI->getVRegDef(PR.R);
+    if (!DefI)
+      return false;
+    unsigned DefOpc = DefI->getOpcode();
+    switch (DefOpc) {
+      case TargetOpcode::COPY: {
+        const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+        if (MRI->getRegClass(PR.R) != PredRC)
+          return false;
+        // If it is a copy between two predicate registers, fall through.
+      }
+      case Hexagon::C2_and:
+      case Hexagon::C2_andn:
+      case Hexagon::C4_and_and:
+      case Hexagon::C4_and_andn:
+      case Hexagon::C4_and_or:
+      case Hexagon::C2_or:
+      case Hexagon::C2_orn:
+      case Hexagon::C4_or_and:
+      case Hexagon::C4_or_andn:
+      case Hexagon::C4_or_or:
+      case Hexagon::C4_or_orn:
+      case Hexagon::C2_xor:
+        // Add operands to the queue.
+        for (ConstMIOperands Mo(DefI); Mo.isValid(); ++Mo)
+          if (Mo->isReg() && Mo->isUse())
+            WorkQ.push(Register(Mo->getReg()));
+        break;
+
+      // All non-vector compares are ok, everything else is bad.
+ default: + return isScalarCmp(DefOpc); + } + } + + return true; +} + + +bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << MI << " " << *MI); + + unsigned Opc = MI->getOpcode(); + assert(isConvertibleToPredForm(MI)); + unsigned NumOps = MI->getNumOperands(); + for (unsigned i = 0; i < NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + Register Reg(MO); + if (Reg.S && Reg.S != Hexagon::subreg_loreg) + return false; + if (!PredGPRs.count(Reg)) + return false; + } + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned NewOpc = getPredForm(Opc); + // Special case for comparisons against 0. + if (NewOpc == 0) { + switch (Opc) { + case Hexagon::C2_cmpeqi: + NewOpc = Hexagon::C2_not; + break; + case Hexagon::C4_cmpneqi: + NewOpc = TargetOpcode::COPY; + break; + default: + return false; + } + + // If it's a scalar predicate register, then all bits in it are + // the same. Otherwise, to determine whether all bits are 0 or not + // we would need to use any8. + Register PR = getPredRegFor(MI->getOperand(1)); + if (!isScalarPred(PR)) + return false; + // This will skip the immediate argument when creating the predicate + // version instruction. + NumOps = 2; + } + + // Some sanity: check that def is in operand #0. + MachineOperand &Op0 = MI->getOperand(0); + assert(Op0.isDef()); + Register OutR(Op0); + + // Don't use getPredRegFor, since it will create an association between + // the argument and a created predicate register (i.e. it will insert a + // copy if a new predicate register is created). + const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; + Register NewPR = MRI->createVirtualRegister(PredRC); + MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R); + + // Add predicate counterparts of the GPRs. + for (unsigned i = 1; i < NumOps; ++i) { + Register GPR = MI->getOperand(i); + Register Pred = getPredRegFor(GPR); + MIB.addReg(Pred.R, 0, Pred.S); + } + DEBUG(dbgs() << "generated: " << *MIB); + + // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR + // with NewGPR. + const TargetRegisterClass *RC = MRI->getRegClass(OutR.R); + unsigned NewOutR = MRI->createVirtualRegister(RC); + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR) + .addReg(NewPR.R, 0, NewPR.S); + MRI->replaceRegWith(OutR.R, NewOutR); + MI->eraseFromParent(); + + // If the processed instruction was C2_tfrrp (i.e. Rn = Pm; Pk = Rn), + // then the output will be a predicate register. Do not visit the + // users of it. + if (!isPredReg(NewOutR)) { + Register R(NewOutR); + PredGPRs.insert(R); + processPredicateGPR(R); + } + return true; +} + + +bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << "\n"); + const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; + bool Changed = false; + VectOfInst Erase; + + // First, replace copies + // IntR = PredR1 + // PredR2 = IntR + // with + // PredR2 = PredR1 + // Such sequences can be generated when a copy-into-pred is generated from + // a gpr register holding a result of a convertible instruction. After + // the convertible instruction is converted, its predicate result will be + // copied back into the original gpr. 
+ + for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { + MachineBasicBlock &B = *A; + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + if (I->getOpcode() != TargetOpcode::COPY) + continue; + Register DR = I->getOperand(0); + Register SR = I->getOperand(1); + if (!TargetRegisterInfo::isVirtualRegister(DR.R)) + continue; + if (!TargetRegisterInfo::isVirtualRegister(SR.R)) + continue; + if (MRI->getRegClass(DR.R) != PredRC) + continue; + if (MRI->getRegClass(SR.R) != PredRC) + continue; + assert(!DR.S && !SR.S && "Unexpected subregister"); + MRI->replaceRegWith(DR.R, SR.R); + Erase.insert(I); + Changed = true; + } + } + + for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I) + (*I)->eraseFromParent(); + + return Changed; +} + + +bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + PredGPRs.clear(); + PUsers.clear(); + G2P.clear(); + + bool Changed = false; + collectPredicateGPR(MF); + for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I) + processPredicateGPR(*I); + + bool Again; + do { + Again = false; + VectOfInst Processed, Copy; + + typedef VectOfInst::iterator iterator; + Copy = PUsers; + for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) { + MachineInstr *MI = *I; + bool Done = convertToPredForm(MI); + if (Done) { + Processed.insert(MI); + Again = true; + } + } + Changed |= Again; + + auto Done = [Processed] (MachineInstr *MI) -> bool { + return Processed.count(MI); + }; + PUsers.remove_if(Done); + } while (Again); + + Changed |= eliminatePredCopies(MF); + return Changed; +} + + +FunctionPass *llvm::createHexagonGenPredicate() { + return new HexagonGenPredicate(); +} + diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 6e9e69f5a2c7..c739afb70c15 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -459,6 +459,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); // Check for varargs. int NumNamedVarArgParams = -1; @@ -515,8 +516,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains; auto &HRI = *Subtarget.getRegisterInfo(); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), - getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); // Walk the register/memloc assignments, inserting copies/loads. 
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -574,7 +575,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (!isTailCall) { - SDValue C = DAG.getConstant(NumBytes, dl, getPointerTy(), true); + SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true); Chain = DAG.getCALLSEQ_START(Chain, C, dl); } @@ -615,13 +616,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (flag_aligned_memcpy) { const char *MemcpyName = "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; - Callee = DAG.getTargetExternalSymbol(MemcpyName, getPointerTy()); + Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT); flag_aligned_memcpy = false; } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy()); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT); } // Returns a chain & a flag for retval copy to use. @@ -811,8 +812,8 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); } - SDValue JumpTableBase = DAG.getNode(HexagonISD::JT, dl, - getPointerTy(), TargetJT); + SDValue JumpTableBase = DAG.getNode( + HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT); SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, DAG.getConstant(2, dl, MVT::i32)); SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, @@ -1231,16 +1232,17 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); SDLoc dl(Op); - Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); const HexagonTargetObjectFile *TLOF = static_cast<const HexagonTargetObjectFile *>( getTargetMachine().getObjFileLowering()); if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) { - return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result); + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result); } - return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result); + return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result); } // Specifies that for loads and stores VT can be promoted to PromotedLdStVT. @@ -1261,7 +1263,8 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32); SDLoc dl(Op); - return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD); + return DAG.getNode(HexagonISD::CONST32_GP, dl, + getPointerTy(DAG.getDataLayout()), BA_SD); } //===----------------------------------------------------------------------===// @@ -2254,6 +2257,7 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Mark function as containing a call to EH_RETURN. 
HexagonMachineFunctionInfo *FuncInfo = @@ -2262,9 +2266,9 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { unsigned OffsetReg = Hexagon::R28; - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), - DAG.getRegister(Hexagon::R30, getPointerTy()), - DAG.getIntPtrConstant(4, dl)); + SDValue StoreAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT), + DAG.getIntPtrConstant(4, dl)); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset); @@ -2338,8 +2342,7 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, std::pair<unsigned, const TargetRegisterClass *> HexagonTargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': // R0-R31 @@ -2372,8 +2375,8 @@ bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { /// isLegalAddressingMode - Return true if the addressing mode represented by /// AM is legal for this target, for a load/store of the specified type. -bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Allows a signed-extended 11-bit immediate field. if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1) @@ -2463,3 +2466,45 @@ bool llvm::isPositiveHalfWord(SDNode *N) { return true; } } + +Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + BasicBlock *BB = Builder.GetInsertBlock(); + Module *M = BB->getParent()->getParent(); + Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); + unsigned SZ = Ty->getPrimitiveSizeInBits(); + assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); + Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked + : Intrinsic::hexagon_L4_loadd_locked; + Value *Fn = Intrinsic::getDeclaration(M, IntID); + return Builder.CreateCall(Fn, Addr, "larx"); +} + +/// Perform a store-conditional operation to Addr. Return the status of the +/// store. This should be 0 if the store succeeded, non-zero otherwise. +Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, AtomicOrdering Ord) const { + BasicBlock *BB = Builder.GetInsertBlock(); + Module *M = BB->getParent()->getParent(); + Type *Ty = Val->getType(); + unsigned SZ = Ty->getPrimitiveSizeInBits(); + assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); + Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked + : Intrinsic::hexagon_S4_stored_locked; + Value *Fn = Intrinsic::getDeclaration(M, IntID); + Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx"); + Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), ""); + Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); + return Ext; +} + +bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + // Do not expand loads and stores that don't exceed 64 bits. + return LI->getType()->getPrimitiveSizeInBits() > 64; +} + +bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + // Do not expand loads and stores that don't exceed 64 bits. 
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; +} + diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index b80e8477eb7b..2642abffaddd 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -165,7 +165,8 @@ bool isPositiveHalfWord(SDNode *N); SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - EVT getSetCCResultType(LLVMContext &C, EVT VT) const override { + EVT getSetCCResultType(const DataLayout &, LLVMContext &C, + EVT VT) const override { if (!VT.isVector()) return MVT::i1; else @@ -179,11 +180,10 @@ bool isPositiveHalfWord(SDNode *N); std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "o") return InlineAsm::Constraint_o; else if (ConstraintCode == "v") @@ -198,8 +198,8 @@ bool isPositiveHalfWord(SDNode *N); /// The type may be VoidTy, in which case only return true if the addressing /// mode is legal for a load/store of any legal type. /// TODO: Handle pre/postinc as well. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal @@ -207,6 +207,21 @@ bool isPositiveHalfWord(SDNode *N); /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; + + // Handling of atomic RMW instructions. 
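// Illustrative sketch (not part of this patch): with these hooks, the
// generic AtomicExpand pass -- added to the IR pipeline via
// createAtomicExpandPass() -- lowers a 32-bit "atomicrmw add" into a
// load-locked / store-conditional retry loop. Modeled in C++, where the
// two extern functions are invented stand-ins for the
// llvm.hexagon.L2.loadw.locked and llvm.hexagon.S2.storew.locked
// intrinsics used by emitLoadLinked/emitStoreConditional:
//
//   extern int32_t loadw_locked(volatile int32_t *P);
//   extern int32_t storew_locked(volatile int32_t *P, int32_t V); // 0 = ok
//
//   int32_t atomic_fetch_add_llsc(volatile int32_t *P, int32_t Inc) {
//     int32_t Old, Status;
//     do {
//       Old = loadw_locked(P);               // start the reservation
//       Status = storew_locked(P, Old + Inc);
//     } while (Status != 0);                 // non-zero status => retry
//     return Old;
//   }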
+ bool hasLoadLinkedStoreConditional() const override { + return true; + } + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) + const override { + return AtomicRMWExpansionKind::LLSC; + } }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 8f255a08f534..f6bb4a045438 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -221,7 +221,7 @@ unsigned HexagonRegisterInfo::getRARegister() const { unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) return Hexagon::R30; return Hexagon::R29; @@ -240,7 +240,8 @@ unsigned HexagonRegisterInfo::getStackRegister() const { bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - return MF.getSubtarget().getFrameLowering()->hasFP(MF); + const HexagonFrameLowering *TFI = getFrameLowering(MF); + return TFI->hasFP(MF); } diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index b5db997eb1b8..276cc69eed0f 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -18,12 +18,6 @@ using namespace llvm; bool llvm::flag_aligned_memcpy; -HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() { -} - SDValue HexagonSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h index 8ac2e43f9294..80ac5d7bd9e2 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -20,8 +20,6 @@ namespace llvm { class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit HexagonSelectionDAGInfo(const DataLayout &DL); - ~HexagonSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index fe6c4f4298b5..cd482b3e3af1 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -74,7 +74,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering() { + FrameLowering() { // Initialize scheduling itinerary for the specified CPU. 
InstrItins = getInstrItineraryForCPU(CPUString); diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index a173a8087832..b50442969a29 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -37,6 +37,18 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true), + cl::Hidden, cl::desc("Generate \"insert\" instructions")); + +static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true), + cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions")); + +static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true), + cl::Hidden, cl::desc("Generate \"extract\" instructions")); + +static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true), + cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " + "predicate instructions")); /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the @@ -60,23 +72,23 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", createVLIWMachineSched); namespace llvm { - FunctionPass *createHexagonExpandCondsets(); - FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, - CodeGenOpt::Level OptLevel); - FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM); - FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); FunctionPass *createHexagonCFGOptimizer(); - - FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonCommonGEP(); + FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonExpandCondsets(); FunctionPass *createHexagonExpandPredSpillCode(); - FunctionPass *createHexagonHardwareLoops(); - FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonFixupHwLoops(); + FunctionPass *createHexagonGenExtract(); + FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenPredicate(); + FunctionPass *createHexagonHardwareLoops(); + FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, + CodeGenOpt::Level OptLevel); FunctionPass *createHexagonNewValueJump(); - FunctionPass *createHexagonCopyToCombine(); FunctionPass *createHexagonPacketizer(); - FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); + FunctionPass *createHexagonSplitConst32AndConst64(); } // end namespace llvm; /// HexagonTargetMachine ctor - Create an ILP32 architecture model. @@ -122,6 +134,7 @@ public: return createVLIWMachineSched(C); } + void addIRPasses() override; bool addInstSelector() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -134,6 +147,20 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { return new HexagonPassConfig(this, PM); } +void HexagonPassConfig::addIRPasses() { + TargetPassConfig::addIRPasses(); + bool NoOpt = (getOptLevel() == CodeGenOpt::None); + + addPass(createAtomicExpandPass(TM)); + if (!NoOpt) { + if (EnableCommGEP) + addPass(createHexagonCommonGEP()); + // Replace certain combinations of shifts and ands with extracts. 
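// Illustrative aside: the shift-and-mask idiom that HexagonGenExtract
// rewrites corresponds to a single extractu operation on Hexagon. A scalar
// model (invented helper names; width/offset chosen arbitrarily):
//
//   uint32_t field_before(uint32_t X) { return (X >> 5) & 0x3F; }
//   // Models extractu(X, #width, #offset):
//   uint32_t extractu(uint32_t X, unsigned W, unsigned Off) {
//     return (X >> Off) & ((W == 32) ? ~0u : ((1u << W) - 1));
//   }
//   uint32_t field_after(uint32_t X) { return extractu(X, 6, 5); }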
+ if (EnableGenExtract) + addPass(createHexagonGenExtract()); + } +} + bool HexagonPassConfig::addInstSelector() { HexagonTargetMachine &TM = getHexagonTargetMachine(); bool NoOpt = (getOptLevel() == CodeGenOpt::None); @@ -144,8 +171,13 @@ bool HexagonPassConfig::addInstSelector() { addPass(createHexagonISelDag(TM, getOptLevel())); if (!NoOpt) { + // Create logical operations on predicate registers. + if (EnableGenPred) + addPass(createHexagonGenPredicate(), false); addPass(createHexagonPeephole()); printAndVerify("After hexagon peephole pass"); + if (EnableGenInsert) + addPass(createHexagonGenInsert(), false); } return false; diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt index 8259055b3f41..9d288af0214a 100644 --- a/lib/Target/Hexagon/LLVMBuild.txt +++ b/lib/Target/Hexagon/LLVMBuild.txt @@ -39,4 +39,5 @@ required_libraries = SelectionDAG Support Target + TransformUtils add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 83ce0abd835e..53305d85fd80 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -46,7 +46,7 @@ MCInstrInfo *llvm::createHexagonMCInstrInfo() { return X; } -static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitHexagonMCRegisterInfo(X, Hexagon::R0); return X; @@ -54,9 +54,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitHexagonMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createHexagonMCSubtargetInfoImpl(TT, CPU, FS); } namespace { @@ -151,7 +149,8 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt index a8f9b52746ad..3f377631c016 100644 --- a/lib/Target/MSP430/CMakeLists.txt +++ b/lib/Target/MSP430/CMakeLists.txt @@ -18,7 +18,6 @@ add_llvm_target(MSP430CodeGen MSP430RegisterInfo.cpp MSP430Subtarget.cpp MSP430TargetMachine.cpp - MSP430SelectionDAGInfo.cpp MSP430AsmPrinter.cpp MSP430MCInstLower.cpp ) diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index be445c56389a..807d1129b5fc 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -37,7 +37,7 @@ static MCInstrInfo *createMSP430MCInstrInfo() { return X; } -static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitMSP430MCRegisterInfo(X, MSP430::PC); return X; @@ -45,12 +45,11 @@ static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitMSP430MCSubtargetInfo(X, TT, CPU, FS); - return X; + return 
createMSP430MCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createMSP430MCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 5ce5013d898c..8a01334ee2dd 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -254,10 +254,11 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N, AM.Base.Reg = CurDAG->getRegister(0, VT); } - Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, - getTargetLowering()->getPointerTy()) : - AM.Base.Reg; + Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) + ? CurDAG->getTargetFrameIndex( + AM.Base.FrameIndex, + getTargetLowering()->getPointerTy(CurDAG->getDataLayout())) + : AM.Base.Reg; if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N), diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index bc51741a836f..29bc8b33988a 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -213,7 +213,7 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. TargetLowering::ConstraintType -MSP430TargetLowering::getConstraintType(const std::string &Constraint) const { +MSP430TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -227,8 +227,7 @@ MSP430TargetLowering::getConstraintType(const std::string &Constraint) const { std::pair<unsigned, const TargetRegisterClass *> MSP430TargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { @@ -494,7 +493,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, if (Flags.isByVal()) { int FI = MFI->CreateFixedObject(Flags.getByValSize(), VA.getLocMemOffset(), true); - InVal = DAG.getFrameIndex(FI, getPointerTy()); + InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { // Load the argument to a virtual register unsigned ObjSize = VA.getLocVT().getSizeInBits()/8; @@ -592,10 +591,10 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. 
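// Context note: these MSP430 hunks follow the same tree-wide API change as
// the Hexagon ones above -- TargetLowering::getPointerTy() now takes the
// module's DataLayout instead of reading it from the TargetMachine, so
// lowering code hoists the pointer VT once per function. Schematically:
//
//   // before:
//   SDValue C = DAG.getConstant(NumBytes, dl, getPointerTy(), true);
//   // after:
//   auto PtrVT = getPointerTy(DAG.getDataLayout());
//   SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);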
unsigned NumBytes = CCInfo.getNextStackOffset(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, dl, - getPointerTy(), true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getConstant(NumBytes, dl, PtrVT, true), dl); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; SmallVector<SDValue, 12> MemOpChains; @@ -630,12 +629,11 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, assert(VA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, PtrVT); - SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), - StackPtr, - DAG.getIntPtrConstant(VA.getLocMemOffset(), - dl)); + SDValue PtrOff = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getIntPtrConstant(VA.getLocMemOffset(), dl)); SDValue MemOp; ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -700,11 +698,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getConstant(NumBytes, dl, getPointerTy(), - true), - DAG.getConstant(0, dl, getPointerTy(), true), - InFlag, dl); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true), + DAG.getConstant(0, dl, PtrVT, true), InFlag, dl); InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we @@ -788,30 +783,31 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Create the TargetGlobalAddress node, folding in the constant offset. 
- SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), - getPointerTy(), Offset); - return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), - getPointerTy(), Result); + SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), PtrVT, Offset); + return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Result); } SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); - SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT); - return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result); + return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); } SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy()); + SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT); - return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result); + return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); } static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC, @@ -1024,16 +1020,17 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); + auto PtrVT = getPointerTy(MF.getDataLayout()); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. - uint64_t SlotSize = getDataLayout()->getPointerSize(); + uint64_t SlotSize = MF.getDataLayout().getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, true); FuncInfo->setRAIndex(ReturnAddrIndex); } - return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); + return DAG.getFrameIndex(ReturnAddrIndex, PtrVT); } SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, @@ -1046,21 +1043,21 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(getDataLayout()->getPointerSize(), dl, MVT::i16); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), - FrameAddr, Offset), + DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, false, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo(), false, false, false, 0); } SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -1084,10 +1081,11 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Frame index of first vararg argument - SDValue FrameIndex = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + SDValue FrameIndex = + DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); // Create a store of the frame index to the location operand @@ -1099,9 +1097,9 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); - return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), - getPointerTy(), Result); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Result); } /// getPostIndexedAddressParts - returns true by value, base pointer and diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 80d3ae175fb1..2d63852c185b 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -72,7 +72,9 @@ namespace llvm { explicit MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI); - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } /// LowerOperation - Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -96,11 +98,10 @@ namespace llvm { SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; TargetLowering::ConstraintType - getConstraintType(const std::string &Constraint) const override; + getConstraintType(StringRef Constraint) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// isTruncateFree - Return true if it's free to truncate a value of type /// Ty1 to type Ty2. e.g. 
On msp430 it's free to truncate a i16 value in diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 614467bcd248..2fb82e535e8d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -37,7 +37,7 @@ MSP430RegisterInfo::MSP430RegisterInfo() const MCPhysReg* MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(*MF); const Function* F = MF->getFunction(); static const MCPhysReg CalleeSavedRegs[] = { MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7, @@ -73,7 +73,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(MF); // Mark 4 special registers with subregisters as reserved. Reserved.set(MSP430::PCB); @@ -109,7 +109,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(MF); DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -156,7 +156,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - + const MSP430FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP; } diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp deleted file mode 100644 index 3897ef684d4d..000000000000 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- MSP430SelectionDAGInfo.cpp - MSP430 SelectionDAG Info -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the MSP430SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "MSP430TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "msp430-selectiondag-info" - -MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() { -} diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h deleted file mode 100644 index 61a6b19111db..000000000000 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MSP430SelectionDAGInfo.h - MSP430 SelectionDAG Info -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the MSP430 subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class MSP430TargetMachine; - -class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit MSP430SelectionDAGInfo(const DataLayout &DL); - ~MSP430SelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 6374f41c00ea..6216348e4d71 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -34,5 +34,4 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()) {} + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {} diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 81f6f027d45c..ff2656d26dd2 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -18,8 +18,8 @@ #include "MSP430ISelLowering.h" #include "MSP430InstrInfo.h" #include "MSP430RegisterInfo.h" -#include "MSP430SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -35,7 +35,7 @@ class MSP430Subtarget : public MSP430GenSubtargetInfo { MSP430FrameLowering FrameLowering; MSP430InstrInfo InstrInfo; MSP430TargetLowering TLInfo; - MSP430SelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; public: /// This constructor initializes the data members to match that @@ -60,7 +60,7 @@ public: const MSP430TargetLowering *getTargetLowering() const override { return &TLInfo; } - const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } }; diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index f14156dbfa2b..5107d2ae58c3 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -1727,37 +1727,59 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, } namespace { -template <unsigned ShiftAmount> +void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(DstReg)); + tmpInst.addOperand(Imm); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions); +} + + +void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(DstReg)); + tmpInst.addOperand(MCOperand::createReg(SrcReg)); + tmpInst.addOperand(Imm); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRR(unsigned Opcode, unsigned DstReg, 
unsigned SrcReg, + unsigned SrcReg2, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc, + Instructions); +} + +void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc, + Instructions); +} + +template <int16_t ShiftAmount> void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - if (ShiftAmount >= 32) { - tmpInst.setOpcode(Mips::DSLL32); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createImm(ShiftAmount - 32)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - tmpInst.clear(); - } else if (ShiftAmount > 0) { - tmpInst.setOpcode(Mips::DSLL); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createImm(ShiftAmount)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - tmpInst.clear(); - } + if (ShiftAmount >= 32) + emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions); + else if (ShiftAmount > 0) + emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions); + // There's no need for an ORi if the immediate is 0. if (Operand.isImm() && Operand.getImm() == 0) return; - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(Operand); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); + emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions); } template <unsigned ShiftAmount> @@ -1818,12 +1840,22 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, return true; } + if (Is32BitImm) { + if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { + // Sign extend up to 64-bit so that the predicates match the hardware + // behaviour. In particular, isInt<16>(0xffff8000) and similar should be + // true. + ImmValue = SignExtend64<32>(ImmValue); + } else { + Error(IDLoc, "instruction requires a 32-bit immediate"); + return true; + } + } + bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; - MCInst tmpInst; - unsigned TmpReg = DstReg; if (UseSrcReg && (DstReg == SrcReg)) { // At this point we need AT to perform the expansions and we exit if it is @@ -1834,29 +1866,26 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, TmpReg = ATReg; } - tmpInst.setLoc(IDLoc); // FIXME: gas has a special case for values that are 000...1111, which // becomes a li -1 and then a dsrl - if (0 <= ImmValue && ImmValue <= 65535) { - // For unsigned and positive signed 16-bit values (0 <= j <= 65535): - // li d,j => ori d,$zero,j - if (!UseSrcReg) - SrcReg = isGP64bit() ? 
Mips::ZERO_64 : Mips::ZERO; - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(MCOperand::createImm(ImmValue)); - Instructions.push_back(tmpInst); - } else if (ImmValue < 0 && ImmValue >= -32768) { - // For negative signed 16-bit values (-32768 <= j < 0): + if (isInt<16>(ImmValue)) { // li d,j => addiu d,$zero,j if (!UseSrcReg) SrcReg = Mips::ZERO; - tmpInst.setOpcode(Mips::ADDiu); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(MCOperand::createImm(ImmValue)); - Instructions.push_back(tmpInst); + emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); + } else if (isUInt<16>(ImmValue)) { + // li d,j => ori d,$zero,j + unsigned TmpReg = DstReg; + if (SrcReg == DstReg) { + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + TmpReg = ATReg; + } + + emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); @@ -1869,30 +1898,16 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, if (!Is32BitImm && !isInt<32>(ImmValue)) { // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the // upper 32 bits. - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createReg(Mips::ZERO)); - tmpInst.addOperand(MCOperand::createImm(Bits31To16)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - // Move the value to the upper 16 bits by doing a 16-bit left shift. - createLShiftOri<16>(0, TmpReg, IDLoc, Instructions); - } else { - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits31To16)); - Instructions.push_back(tmpInst); - } + emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions); + emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions); + } else + emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions); if (UseSrcReg) createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); } else if ((ImmValue & (0xffffLL << 48)) == 0) { - if (Is32BitImm) { - Error(IDLoc, "instruction requires a 32-bit immediate"); - return true; - } warnIfNoMacro(IDLoc); // <------- lo32 ------> @@ -1912,10 +1927,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits47To32)); - Instructions.push_back(tmpInst); + emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions); createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions); createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); @@ -1923,10 +1935,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); } else { - if (Is32BitImm) { - Error(IDLoc, "instruction requires a 32-bit immediate"); - return true; - } warnIfNoMacro(IDLoc); // <------- hi32 ------> <------- lo32 ------> @@ -1948,10 +1956,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, uint16_t Bits31To16 = (ImmValue >> 16) & 
0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits63To48)); - Instructions.push_back(tmpInst); + emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions); createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions); // When Bits31To16 is 0, do a left shift of 32 bits instead of doing @@ -2096,8 +2101,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress( tmpInst.addOperand(MCOperand::createExpr(HiExpr)); Instructions.push_back(tmpInst); - createLShiftOri<0>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(), - Instructions); + emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(), + Instructions); } if (UseSrcReg) @@ -2708,12 +2713,8 @@ void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, SmallVectorImpl<MCInst> &Instructions) { - MCInst AdduInst; - AdduInst.setOpcode(Is64Bit ? Mips::DADDu : Mips::ADDu); - AdduInst.addOperand(MCOperand::createReg(DstReg)); - AdduInst.addOperand(MCOperand::createReg(SrcReg)); - AdduInst.addOperand(MCOperand::createReg(TrgReg)); - Instructions.push_back(AdduInst); + emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(), + Instructions); } unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 36ba8e559e0b..bde843afd3d2 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -46,7 +46,6 @@ add_llvm_target(MipsCodeGen MipsSubtarget.cpp MipsTargetMachine.cpp MipsTargetObjectFile.cpp - MipsSelectionDAGInfo.cpp ) add_subdirectory(InstPrinter) diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 9bdf8235a2b4..949ee1474f96 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -59,7 +59,7 @@ static MCInstrInfo *createMipsMCInstrInfo() { return X; } -static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitMipsMCRegisterInfo(X, Mips::RA); return X; @@ -68,9 +68,7 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { CPU = MIPS_MC::selectMipsCPU(TT, CPU); - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitMipsMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createMipsMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, @@ -84,7 +82,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createMipsMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index db2a924a99f9..46cc99c62393 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -152,18 +152,19 @@ Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects(); } -void 
Mips16FrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void Mips16FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const Mips16InstrInfo &TII = *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo()); const MipsRegisterInfo &RI = TII.getRegisterInfo(); const BitVector Reserved = RI.getReservedRegs(MF); bool SaveS2 = Reserved[Mips::S2]; if (SaveS2) - MF.getRegInfo().setPhysRegUsed(Mips::S2); + SavedRegs.set(Mips::S2); if (hasFP(MF)) - MF.getRegInfo().setPhysRegUsed(Mips::S0); + SavedRegs.set(Mips::S0); } const MipsFrameLowering * diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index f281c927c1c4..b48ed4641ea7 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -38,8 +38,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 7b6a2a154471..bce2c1eb4485 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -120,13 +120,13 @@ void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() { unsigned Mips16SPAliasReg = MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg(); - return CurDAG->getRegister(Mips16SPAliasReg, - getTargetLowering()->getPointerTy()); + auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); + return CurDAG->getRegister(Mips16SPAliasReg, PtrVT); } void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) { - SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, - getTargetLowering()->getPointerTy()); + auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); + SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, PtrVT); if (Parent) { switch (Parent->getOpcode()) { case ISD::LOAD: { @@ -155,7 +155,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) { } } } - AliasReg = CurDAG->getRegister(Mips::SP, getTargetLowering()->getPointerTy()); + AliasReg = CurDAG->getRegister(Mips::SP, PtrVT); return; } diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 846e3c964f44..3522cbb1f36a 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -502,7 +502,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, unsigned V0Reg = Mips::V0; if (NeedMips16Helper) { RegsToPass.push_front(std::make_pair(V0Reg, Callee)); - JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy()); + JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, + getPointerTy(DAG.getDataLayout())); ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget); JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG, MipsII::MO_GOT, Chain, diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index c2651b82d285..e2f6fcc17726 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -267,7 +267,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, 
MVT RetVT, } unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i32 && + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 && "Alloca should always return a pointer."); DenseMap<const AllocaInst *, int>::iterator SI = @@ -382,7 +382,7 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) { // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -507,12 +507,13 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) { break; case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; } @@ -532,7 +533,7 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) { } bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) return false; @@ -931,8 +932,8 @@ bool MipsFastISel::selectFPExt(const Instruction *I) { if (UnsupportedFPMode) return false; Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; @@ -998,8 +999,8 @@ bool MipsFastISel::selectFPTrunc(const Instruction *I) { if (UnsupportedFPMode) return false; Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; @@ -1415,7 +1416,8 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + // Analyze operands of the call, assigning locations to each operand. 
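The GetReturnInfo call just above shows the pattern this patch applies throughout the FastISel implementations: TargetLowering's type queries, and helpers such as GetReturnInfo, now take the module's DataLayout as an explicit argument instead of reading one cached on the TargetMachine. A minimal sketch of the rewritten query, using the post-patch signature visible in these hunks:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/Target/TargetLowering.h"

  // was: TLI.getValueType(Ty, /*AllowUnknown=*/true)
  static llvm::EVT queryValueType(const llvm::TargetLowering &TLI,
                                  const llvm::DataLayout &DL, llvm::Type *Ty) {
    return TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  }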
SmallVector<CCValAssign, 16> ValLocs; MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, @@ -1449,7 +1451,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; @@ -1493,8 +1495,8 @@ bool MipsFastISel::selectTrunc(const Instruction *I) { Value *Op = I->getOperand(0); EVT SrcVT, DestVT; - SrcVT = TLI.getValueType(Op->getType(), true); - DestVT = TLI.getValueType(I->getType(), true); + SrcVT = TLI.getValueType(DL, Op->getType(), true); + DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) return false; @@ -1521,8 +1523,8 @@ bool MipsFastISel::selectIntExt(const Instruction *I) { return false; EVT SrcEVT, DestEVT; - SrcEVT = TLI.getValueType(SrcTy, true); - DestEVT = TLI.getValueType(DestTy, true); + SrcEVT = TLI.getValueType(DL, SrcTy, true); + DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -1620,7 +1622,7 @@ unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, } bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); + EVT DestEVT = TLI.getValueType(DL, I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -1685,7 +1687,7 @@ bool MipsFastISel::selectShift(const Instruction *I) { if (!TempReg) return false; - MVT Op0MVT = TLI.getValueType(Op0->getType(), true).getSimpleVT(); + MVT Op0MVT = TLI.getValueType(DL, Op0->getType(), true).getSimpleVT(); bool IsZExt = Opcode == Instruction::LShr; if (!emitIntExt(Op0MVT, Op0Reg, MVT::i32, TempReg, IsZExt)) return false; @@ -1803,7 +1805,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V, unsigned VReg = getRegForValue(V); if (VReg == 0) return 0; - MVT VMVT = TLI.getValueType(V->getType(), true).getSimpleVT(); + MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT(); if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) { unsigned TempReg = createResultReg(&Mips::GPR32RegClass); if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned)) diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 2c9868ac051d..06502397b6b8 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -59,8 +59,9 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { /// GOT address into a register. 
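The function that follows, getGlobalBaseReg, is a typical call site of the companion rewrite: the pointer type is now derived from the DataLayout reached through the DAG (CurDAG->getDataLayout() in the ISel passes, DAG.getDataLayout() in the lowering code below). A minimal sketch of that access path:

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/Target/TargetLowering.h"

  static llvm::MVT pointerVT(const llvm::TargetLowering &TLI,
                             llvm::SelectionDAG &DAG) {
    // was: TLI.getPointerTy(); the DataLayout is no longer implicit.
    return TLI.getPointerTy(DAG.getDataLayout());
  }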
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg(); - return CurDAG->getRegister(GlobalBaseReg, - getTargetLowering()->getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy( + CurDAG->getDataLayout())) + .getNode(); } /// ComplexPattern used on MipsInstrInfo diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 67ddcc4dacb9..fbebb9abb4cc 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -466,7 +466,8 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, return Mips::createFastISel(funcInfo, libInfo); } -EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -1579,9 +1580,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); SDLoc DL(Op); - EVT PTy = getPointerTy(); + auto &TD = DAG.getDataLayout(); + EVT PTy = getPointerTy(TD); unsigned EntrySize = - DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout()); + DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD); Index = DAG.getNode(ISD::MUL, DL, PTy, Index, DAG.getConstant(EntrySize, DL, PTy)); @@ -1647,10 +1649,10 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = DAG.getNode(ISD::SETCC, DL, - getSetCCResultType(*DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), - Op.getOperand(4)); + SDValue Cond = + DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), Ty), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), Op.getOperand(3)); @@ -1723,7 +1725,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = getTargetMachine().getTLSModel(GV); @@ -1831,7 +1833,7 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + getPointerTy(MF.getDataLayout())); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. @@ -1850,9 +1852,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Node); unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4; - SDValue VAListLoad = DAG.getLoad(getPointerTy(), DL, Chain, VAListPtr, - MachinePointerInfo(SV), false, false, false, - 0); + SDValue VAListLoad = + DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain, VAListPtr, + MachinePointerInfo(SV), false, false, false, 0); SDValue VAList = VAListLoad; // Re-align the pointer if necessary. @@ -1874,7 +1876,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { } // Increment the pointer, VAList, to the next vaarg. 
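The va_arg increment that opens the next hunk rounds the argument size up to the argument-slot size before bumping the va_list pointer; the slot size is 8 bytes on N32/N64 and 4 on O32, per the ArgSlotSizeInBytes computation above. A scalar model of that arithmetic (the mask form assumes a power-of-two slot size, which holds here):

  #include <cstdint>

  static uint64_t nextVAArgPtr(uint64_t VAList, uint64_t ArgSizeInBytes,
                               uint64_t SlotSizeInBytes) {
    // RoundUpToAlignment(ArgSizeInBytes, SlotSizeInBytes), open-coded.
    uint64_t Rounded =
        (ArgSizeInBytes + SlotSizeInBytes - 1) & ~(SlotSizeInBytes - 1);
    return VAList + Rounded; // DAG form: ADD(VAList, Constant(Rounded))
  }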
- unsigned ArgSizeInBytes = getDataLayout()->getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())); + auto &TD = DAG.getDataLayout(); + unsigned ArgSizeInBytes = + TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())); SDValue Tmp3 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, DAG.getConstant(RoundUpToAlignment(ArgSizeInBytes, ArgSlotSizeInBytes), @@ -2062,7 +2066,7 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1)); return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain, DAG.getRegister(OffsetReg, Ty), - DAG.getRegister(AddrReg, getPointerTy()), + DAG.getRegister(AddrReg, getPointerTy(MF.getDataLayout())), Chain.getValue(1)); } @@ -2479,15 +2483,16 @@ MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain, SDValue Arg, SDLoc DL, bool IsTailCall, SelectionDAG &DAG) const { if (!IsTailCall) { - SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - DAG.getIntPtrConstant(Offset, DL)); + SDValue PtrOff = + DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr, + DAG.getIntPtrConstant(Offset, DL)); return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false, false, 0); } MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(), /*isVolatile=*/ true, false, 0); } @@ -2611,8 +2616,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsTailCall) Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); - SDValue StackPtr = DAG.getCopyFromReg( - Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, + getPointerTy(DAG.getDataLayout())); // With EABI is it possible to have 16 args on registers. 
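passArgOnStack, in the hunk above, computes an outgoing argument's address as stack pointer plus byte offset for ordinary calls; for tail calls it instead stores through a fixed frame object so the value lands where the callee expects it once the frame is reused. A scalar sketch of the non-tail path:

  #include <cstdint>

  static uint64_t outgoingArgAddr(uint64_t StackPtr, unsigned Offset) {
    // DAG form: ADD(PtrVT, StackPtr, IntPtrConstant(Offset)), then the store.
    return StackPtr + Offset;
  }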
std::deque< std::pair<unsigned, SDValue> > RegsToPass; @@ -2750,7 +2756,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, IsCallReloc = true; } } else - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0, + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, + getPointerTy(DAG.getDataLayout()), 0, MipsII::MO_NO_FLAG); GlobalOrExternal = true; } @@ -2758,8 +2765,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const char *Sym = S->getSymbol(); if (!ABI.IsN64() && !IsPIC) // !N64 && static - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG); + Callee = DAG.getTargetExternalSymbol( + Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG); else if (LargeGOT) { Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, @@ -3029,7 +3036,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -3174,12 +3181,13 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); + SDValue Val = + DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout())); unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(V0, getPointerTy())); + RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout()))); } RetOps[0] = Chain; // Update chain. @@ -3198,9 +3206,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. -MipsTargetLowering::ConstraintType MipsTargetLowering:: -getConstraintType(const std::string &Constraint) const -{ +MipsTargetLowering::ConstraintType +MipsTargetLowering::getConstraintType(StringRef Constraint) const { // Mips specific constraints // GCC config/mips/constraints.md // @@ -3290,9 +3297,8 @@ MipsTargetLowering::getSingleConstraintMatchWeight( /// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag /// that is returned indicates whether parsing was successful. The second flag /// is true if the numeric part exists. -static std::pair<bool, bool> -parsePhysicalReg(StringRef C, std::string &Prefix, - unsigned long long &Reg) { +static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix, + unsigned long long &Reg) { if (C.front() != '{' || C.back() != '}') return std::make_pair(false, false); @@ -3300,7 +3306,7 @@ parsePhysicalReg(StringRef C, std::string &Prefix, StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1; I = std::find_if(B, E, std::ptr_fun(isdigit)); - Prefix.assign(B, I - B); + Prefix = StringRef(B, I - B); // The second flag is set to false if no numeric characters were found. 
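parsePhysicalReg, whose tail appears just above, splits a "{...}" constraint into a name prefix and an optional register number, and now hands the prefix back as a StringRef view of the constraint instead of copying it into a std::string. A self-contained model of the split, with std::string_view standing in for StringRef and illustrative names:

  #include <cctype>
  #include <string_view>
  #include <utility>

  // "{hi}"   -> ("hi", "")   (no numeric part)
  // "{$f12}" -> ("$f", "12")
  static std::pair<std::string_view, std::string_view>
  splitRegConstraint(std::string_view C) {
    if (C.size() < 3 || C.front() != '{' || C.back() != '}')
      return {}; // not a physical-register constraint
    std::string_view Body = C.substr(1, C.size() - 2);
    std::size_t I = 0;
    while (I < Body.size() &&
           !std::isdigit(static_cast<unsigned char>(Body[I])))
      ++I;
    return {Body.substr(0, I), Body.substr(I)}; // (prefix, digits)
  }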
if (I == E) @@ -3316,7 +3322,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const TargetRegisterClass *RC; - std::string Prefix; + StringRef Prefix; unsigned long long Reg; std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg); @@ -3332,7 +3338,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { RC = TRI->getRegClass(Prefix == "hi" ? Mips::HI32RegClassID : Mips::LO32RegClassID); return std::make_pair(*(RC->begin()), RC); - } else if (Prefix.compare(0, 4, "$msa") == 0) { + } else if (Prefix.startswith("$msa")) { // Parse $msa(ir|csr|access|save|modify|request|map|unmap) // No numeric characters follow the name. @@ -3390,7 +3396,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { /// pointer. std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -3546,8 +3552,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op, TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // No global is ever allowed as a base. if (AM.BaseGV) @@ -3625,7 +3631,7 @@ void MipsTargetLowering::copyByValRegs( FrameObjOffset = VA.getLocMemOffset(); // Create frame object. - EVT PtrTy = getPointerTy(); + EVT PtrTy = getPointerTy(DAG.getDataLayout()); int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrTy); InVals.push_back(FIN); @@ -3662,7 +3668,8 @@ void MipsTargetLowering::passByValArg( unsigned OffsetInBytes = 0; // From beginning of struct unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes); - EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); + EVT PtrTy = getPointerTy(DAG.getDataLayout()), + RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); unsigned NumRegs = LastReg - FirstReg; if (NumRegs) { @@ -3787,7 +3794,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, unsigned Reg = addLiveIn(MF, ArgRegs[I], RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy); FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true); - SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo(), false, false, 0); cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue( @@ -3920,8 +3927,8 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned MipsTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { // Named registers is expected to be fairly rare. For now, just support $28 // since the linux kernel uses it. 
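getRegisterByName, whose comment closes the hunk above, gains a SelectionDAG parameter so overrides can consult the DAG while resolving a register named via llvm.read_register and friends. A schematic of the lookup it performs for Mips; the numeric values stand in for the Mips::GP_64 and Mips::GP enum constants and are not real register numbers:

  #include "llvm/ADT/StringSwitch.h"

  static unsigned lookupNamedReg(const char *RegName, bool IsGP64) {
    return llvm::StringSwitch<unsigned>(RegName)
        .Case("$28", IsGP64 ? 64u : 32u) // only $28 is recognized for now
        .Default(0u);                    // 0: not a supported named register
  }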
if (Subtarget.isGP64bit()) { diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index bc9a1ce64097..6fe8f830d35d 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -227,7 +227,9 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -247,7 +249,8 @@ namespace llvm { const char *getTargetNodeName(unsigned Opcode) const override; /// getSetCCResultType - get the ISD::SETCC result ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; @@ -263,7 +266,8 @@ namespace llvm { void HandleByVal(CCState *, unsigned &, unsigned) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; protected: SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const; @@ -478,8 +482,7 @@ namespace llvm { bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; // Inline asm support - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -493,8 +496,7 @@ namespace llvm { std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is @@ -505,8 +507,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "R") return InlineAsm::Constraint_R; else if (ConstraintCode == "ZC") @@ -514,8 +516,8 @@ namespace llvm { return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ec7bf314c641..096b3bee5d07 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -621,10 +621,17 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { !MFI->hasVarSizedObjects(); } -void MipsSEFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); +/// Mark \p Reg and all registers aliasing it in the bitset. 
+void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + SavedRegs.set(*AI); +} + +void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); MipsABIInfo ABI = STI.getABI(); unsigned FP = ABI.GetFramePtr(); @@ -632,10 +639,10 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Mark $fp as used if function has dedicated frame pointer. if (hasFP(MF)) - MRI.setPhysRegUsed(FP); + setAliasRegs(MF, SavedRegs, FP); // Mark $s7 as used if function has dedicated base pointer. if (hasBP(MF)) - MRI.setPhysRegUsed(BP); + setAliasRegs(MF, SavedRegs, BP); // Create spill slots for eh data registers if function calls eh_return. if (MipsFI->callsEhReturn()) diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index 2fcd6bbb9a15..9cb32e6c7829 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -34,8 +34,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; }; diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 990a2f8d8c85..cb46d731da29 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -841,7 +841,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { } case MipsISD::ThreadPointer: { - EVT PtrVT = getTargetLowering()->getPointerTy(); + EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); unsigned RdhwrOpc, DestReg; if (PtrVT == MVT::i32) { diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index ae2837a8582c..b319fd07884b 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -838,8 +838,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) if (!VT.isVector()) - return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), - VT, TL->getScalarShiftAmountTy(VT), DAG); + return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), VT, + TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT), + DAG); return SDValue(N, 0); } diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp deleted file mode 100644 index edd8f670707f..000000000000 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- MipsSelectionDAGInfo.cpp - Mips SelectionDAG Info -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the MipsSelectionDAGInfo class. 
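setAliasRegs above is the other half of the callee-saved migration this patch repeats in every target: determineCalleeSaves replaces processFunctionBeforeCalleeSavedScan, delegates to the base class first, and records extra registers (aliases included) in the SavedRegs bitvector where the old hook called MachineRegisterInfo::setPhysRegUsed. A schematic override; the enclosing class and the FramePointerReg constant are placeholders, not LLVM names:

  void ExampleFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                  BitVector &SavedRegs,
                                                  RegScavenger *RS) const {
    // Let the generic implementation mark what it already knows about.
    TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
    // Then add target-specific spills, covering sub- and super-registers.
    if (hasFP(MF))
      setAliasRegs(MF, SavedRegs, FramePointerReg);
  }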
-// -//===----------------------------------------------------------------------===// - -#include "MipsTargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "mips-selectiondag-info" - -MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -MipsSelectionDAGInfo::~MipsSelectionDAGInfo() { -} diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h deleted file mode 100644 index 061423fbeb86..000000000000 --- a/lib/Target/Mips/MipsSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MipsSelectionDAGInfo.h - Mips SelectionDAG Info ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the Mips subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class MipsTargetMachine; - -class MipsSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit MipsSelectionDAGInfo(const DataLayout &DL); - ~MipsSelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index c41bb16a58ea..471b6e19a8bb 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -70,7 +70,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), - HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(*TM.getDataLayout()), + HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 5f9296812e1c..1db8881404c9 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -18,10 +18,10 @@ #include "MipsFrameLowering.h" #include "MipsISelLowering.h" #include "MipsInstrInfo.h" -#include "MipsSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -140,7 +140,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { Triple TargetTriple; - const MipsSelectionDAGInfo TSInfo; + const TargetSelectionDAGInfo TSInfo; std::unique_ptr<const MipsInstrInfo> InstrInfo; std::unique_ptr<const MipsFrameLowering> FrameLowering; std::unique_ptr<const MipsTargetLowering> TLInfo; @@ -275,7 +275,7 @@ public: void setHelperClassesMips16(); void setHelperClassesMipsSE(); - const MipsSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c820668befa0..1c77745d130b 100644 --- 
a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -62,7 +62,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, if (!ABI.IsN64()) Ret += "-p:32:32"; - // 8 and 16 bit integers only need no have natural alignment, but try to + // 8 and 16 bit integers only need to have natural alignment, but try to // align them to 32 bits. 64 bit integers have natural alignment. Ret += "-i8:8:32-i16:16:32-i64:64"; @@ -237,7 +237,7 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. - return TargetTransformInfo(getDataLayout()); + return TargetTransformInfo(F.getParent()->getDataLayout()); } DEBUG(errs() << "Target Transform Info Pass Added\n"); diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 221d2f093aeb..ad7302037cad 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -37,7 +37,7 @@ static MCInstrInfo *createNVPTXMCInstrInfo() { return X; } -static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); // PTX does not have a return address register. InitNVPTXMCRegisterInfo(X, 0); @@ -46,13 +46,13 @@ static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitNVPTXMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createNVPTXMCCodeGenInfo( - StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { +static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); // The default relocation model is used regardless of what the client has diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index cadd7a46cd9d..ecb0f0a1d0a1 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -340,7 +340,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { } void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); Type *Ty = F->getReturnType(); @@ -366,20 +366,20 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { O << ".param .b" << size << " func_retval0"; } else if (isa<PointerType>(Ty)) { - O << ".param .b" << TLI->getPointerTy().getSizeInBits() + O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits() << " func_retval0"; } else if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) { - unsigned totalsz = TD->getTypeAllocSize(Ty); + unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) - retAlignment = TD->getABITypeAlignment(Ty); + retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else llvm_unreachable("Unknown return type"); } else { 
SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); + ComputeValueVTs(*TLI, DL, Ty, vtparts); unsigned idx = 0; for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; @@ -1433,7 +1433,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool first = true; bool isKernelFunc = llvm::isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); - MVT thePointerTy = TLI->getPointerTy(); + MVT thePointerTy = TLI->getPointerTy(*TD); O << "(\n"; @@ -1579,7 +1579,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Further, if a part is vector, print the above for // each vector element. SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, ETy, vtparts); + ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts); for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; EVT elemtype = vtparts[i]; diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 09e0bd5d3d88..b75cf4040312 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -80,14 +80,14 @@ static bool IsPTXVectorType(MVT VT) { /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the /// same number of types as the Ins/Outs arrays in LowerFormalArguments, /// LowerCall, and LowerReturn. -static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty, - SmallVectorImpl<EVT> &ValueVTs, +static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl<EVT> &ValueVTs, SmallVectorImpl<uint64_t> *Offsets = nullptr, uint64_t StartingOffset = 0) { SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset); + ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { EVT VT = TempVTs[i]; uint64_t Off = TempOffsets[i]; @@ -885,15 +885,16 @@ SDValue NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); + return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); } -std::string -NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, - const SmallVectorImpl<ISD::OutputArg> &Outs, - unsigned retAlignment, - const ImmutableCallSite *CS) const { +std::string NVPTXTargetLowering::getPrototype( + const DataLayout &DL, Type *retTy, const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, + const ImmutableCallSite *CS) const { + auto PtrVT = getPointerTy(DL); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); @@ -921,13 +922,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { - O << ".param .b" << getPointerTy().getSizeInBits() << " _"; + O << ".param .b" << PtrVT.getSizeInBits() << " _"; } else if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { - O << ".param .align " - << retAlignment - << " .b8 _[" - << getDataLayout()->getTypeAllocSize(retTy) << "]"; + auto &DL = 
CS->getCalledFunction()->getParent()->getDataLayout(); + O << ".param .align " << retAlignment << " .b8 _[" + << DL.getTypeAllocSize(retTy) << "]"; } else { llvm_unreachable("Unknown return type"); } @@ -936,7 +936,6 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, O << "_ ("; bool first = true; - MVT thePointerTy = getPointerTy(); unsigned OIdx = 0; for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { @@ -950,24 +949,23 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); - const DataLayout *TD = getDataLayout(); // +1 because index 0 is reserved for return type alignment if (!llvm::getAlign(*CallI, i + 1, align)) - align = TD->getABITypeAlignment(Ty); - unsigned sz = TD->getTypeAllocSize(Ty); + align = DL.getABITypeAlignment(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); O << ".param .align " << align << " .b8 "; O << "_"; O << "[" << sz << "]"; // update the index for Outs SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + ComputeValueVTs(*this, DL, Ty, vtparts); if (unsigned len = vtparts.size()) OIdx += len - 1; continue; } // i8 types in IR will be i16 types in SDAG - assert((getValueType(Ty) == Outs[OIdx].VT || - (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && + assert((getValueType(DL, Ty) == Outs[OIdx].VT || + (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && "type mismatch between callee prototype and arguments"); // scalar type unsigned sz = 0; @@ -976,7 +974,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, if (sz < 32) sz = 32; } else if (isa<PointerType>(Ty)) - sz = thePointerTy.getSizeInBits(); + sz = PtrVT.getSizeInBits(); else sz = Ty->getPrimitiveSizeInBits(); O << ".param .b" << sz << " "; @@ -988,7 +986,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, Type *ETy = PTy->getElementType(); unsigned align = Outs[OIdx].Flags.getByValAlign(); - unsigned sz = getDataLayout()->getTypeAllocSize(ETy); + unsigned sz = DL.getTypeAllocSize(ETy); O << ".param .align " << align << " .b8 "; O << "_"; O << "[" << sz << "]"; @@ -1002,7 +1000,6 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const { - const DataLayout *TD = getDataLayout(); unsigned Align = 0; const Value *DirectCallee = CS->getCalledFunction(); @@ -1043,7 +1040,8 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Call is indirect or alignment information is not available, fall back to // the ABI type alignment - return TD->getABITypeAlignment(Ty); + auto &DL = CS->getCaller()->getParent()->getDataLayout(); + return DL.getABITypeAlignment(Ty); } SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, @@ -1064,9 +1062,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) return Chain; - const DataLayout *TD = getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); + auto &DL = MF.getDataLayout(); SDValue tempChain = Chain; Chain = DAG.getCALLSEQ_START(Chain, @@ -1096,11 +1094,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // aggregate SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); + 
ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets, + 0); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl, MVT::i32), @@ -1137,10 +1136,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl, MVT::i32), @@ -1321,7 +1320,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<uint64_t, 16> Offsets; const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), + vtparts, &Offsets, 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); @@ -1342,9 +1342,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT elemtype = vtparts[j]; int curOffset = Offsets[j]; unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, dl, getPointerTy())); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], + DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), false, false, false, PartAlign); @@ -1371,12 +1371,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle Result if (Ins.size() > 0) { SmallVector<EVT, 16> resvtparts; - ComputeValueVTs(*this, retTy, resvtparts); + ComputeValueVTs(*this, DL, retTy, resvtparts); // Declare // .param .align 16 .b8 retval0[<size-in-bytes>], or // .param .b<size-in-bits> retval0 - unsigned resultsz = TD->getTypeAllocSizeInBits(retTy); + unsigned resultsz = DL.getTypeAllocSizeInBits(retTy); // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for // these three types to match the logic in // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. @@ -1415,7 +1415,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. 
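For reference, the .param fragments that getPrototype assembles, per the rules visible in the hunk above, look roughly as follows; the outer framing of the prototype string is elided here, and the sizes assume a 64-bit NVPTX target:

  .param .b32 _              // i32 return or argument (sub-32-bit integers round up)
  .param .b64 _              // pointer return or argument
  .param .align 8 .b8 _[24]  // 24-byte aggregate with ABI alignment 8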
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); - std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS); + std::string Proto = + getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS); const char *ProtoStr = nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); SDValue ProtoOps[] = { @@ -1477,7 +1478,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { if (retTy && retTy->isVectorTy()) { - EVT ObjectVT = getValueType(retTy); + EVT ObjectVT = getValueType(DL, retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); EVT EltVT = ObjectVT.getVectorElementType(); assert(STI.getTargetLowering()->getNumRegisters(F->getContext(), @@ -1590,13 +1591,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); InVals.push_back(Elt); } - Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); } } } else { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { @@ -1608,8 +1609,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<EVT, 4> LoadRetVTs; EVT TheLoadType = VTs[i]; - if (retTy->isIntegerTy() && - TD->getTypeAllocSizeInBits(retTy) < 32) { + if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) { // This is for integer types only, and specifically not for // aggregates. LoadRetVTs.push_back(MVT::i32); @@ -1920,11 +1920,11 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { } MemSDNode *MemSD = cast<MemSDNode>(N); - const DataLayout *TD = getDataLayout(); + const DataLayout &TD = DAG.getDataLayout(); unsigned Align = MemSD->getAlignment(); unsigned PrefAlign = - TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); + TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); if (Align < PrefAlign) { // This store is not sufficiently aligned, so bail out and let this vector // store be scalarized. 
Note that we may still be able to emit smaller @@ -2064,7 +2064,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); - const DataLayout *TD = getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); const Function *F = MF.getFunction(); const AttributeSet &PAL = F->getAttributes(); @@ -2118,7 +2119,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (Ty->isAggregateType()) { SmallVector<EVT, 16> vtparts; - ComputePTXValueVTs(*this, Ty, vtparts); + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { @@ -2130,7 +2131,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); for (unsigned parti = 0; parti < NumRegs; ++parti) { InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); @@ -2156,13 +2157,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // NOTE: Here, we lose the ability to issue vector loads for vectors // that are a part of a struct. This should be investigated in the // future. - ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets, + 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) aggregateIsPacked = STy->isPacked(); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { EVT partVT = vtparts[parti]; @@ -2170,12 +2172,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( PointerType::get(partVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, - DAG.getConstant(offsets[parti], dl, getPointerTy())); - unsigned partAlign = - aggregateIsPacked ? 1 - : TD->getABITypeAlignment( - partVT.getTypeForEVT(F->getContext())); + DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(offsets[parti], dl, PtrVT)); + unsigned partAlign = aggregateIsPacked + ? 1 + : DL.getABITypeAlignment( + partVT.getTypeForEVT(F->getContext())); SDValue p; if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
@@ -2198,8 +2200,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + EVT ObjectVT = getValueType(DL, Ty); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); unsigned NumElts = ObjectVT.getVectorNumElements(); assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && "Vector was not scalarized"); @@ -2212,9 +2214,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, - false, true, - TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); + EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, + true, + DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2229,9 +2231,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, - false, true, - TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, + true, + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2269,13 +2271,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue( PointerType::get(VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); - SDValue SrcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, - DAG.getConstant(Ofst, dl, getPointerTy())); + SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, false, true, - TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2288,7 +2289,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt); InVals.push_back(Elt); } - Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); } InsIdx += NumElts; } @@ -2298,23 +2299,24 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } // A plain scalar. - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); // If ABI, load from the param symbol - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); Value *srcValue = Constant::getNullValue(PointerType::get( ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue p; if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg, - MachinePointerInfo(srcValue), ObjectVT, false, false, - false, - TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + p = DAG.getExtLoad( + ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue), + ObjectVT, false, false, false, + DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } else { - p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg, - MachinePointerInfo(srcValue), false, false, false, - TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + p = DAG.getLoad( + Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false, + false, false, + DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } if (p.getNode()) p.getNode()->setIROrder(idx + 1); @@ -2329,10 +2331,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // machine instruction fails because TargetExternalSymbol // (not lowered) is target dependent, and CopyToReg assumes // the source is lowered. - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); @@ -2370,7 +2372,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); Type *RetTy = F->getReturnType(); - const DataLayout *TD = getDataLayout(); + const DataLayout &TD = DAG.getDataLayout(); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); @@ -2384,7 +2386,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(NumElts == Outs.size() && "Bad scalarization of return value"); // const_cast can be removed in later LLVM versions - EVT EltVT = getValueType(RetTy).getVectorElementType(); + EVT EltVT = getValueType(TD, RetTy).getVectorElementType(); bool NeedExtend = false; if (EltVT.getSizeInBits() < 16) NeedExtend = true; @@ -2435,7 +2437,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); unsigned PerStoreOffset = - TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); for (unsigned i = 0; i < NumElts; i += VecSize) { // Get values @@ -2493,7 +2495,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } else { SmallVector<EVT, 16> ValVTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0); assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); for (unsigned i = 0, e = Outs.size(); i != e; ++i) { @@ -2509,8 +2511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, TheValType.getVectorElementType(), TmpVal, DAG.getIntPtrConstant(j, dl)); EVT TheStoreType = ValVTs[i]; - if (RetTy->isIntegerTy() && - TD->getTypeAllocSizeInBits(RetTy) < 32) { + if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) { // The following zero-extension is for integer types only, and // specifically not for aggregates. 
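The zero-extension that follows reflects a PTX rule worth making explicit: integer .param slots are at least 32 bits wide, so i8/i16 values travel as .b32 (the same rule behind the earlier "if (sz < 32) sz = 32;"). A minimal sketch of that rule, using a hypothetical helper name that does not appear in the patch:

// The parameter-width rule this file applies to scalar integers.
static unsigned ptxParamBits(unsigned IRBits, bool IsInteger) {
  return (IsInteger && IRBits < 32) ? 32 : IRBits; // i8/i16 widen to .b32
}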
TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal); @@ -3291,14 +3292,14 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { - + auto &DL = I.getModule()->getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(); + Info.memVT = getPointerTy(DL); else - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; @@ -3311,14 +3312,15 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_p: { + auto &DL = I.getModule()->getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldg_global_i) - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) - Info.memVT = getPointerTy(); + Info.memVT = getPointerTy(DL); else - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; @@ -3731,8 +3733,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( /// Used to guide target specific optimizations, like loop strength reduction /// (LoopStrengthReduce.cpp) and memory optimization for address mode /// (CodeGenPrepare.cpp) -bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // AddrMode - This represents an addressing mode of: @@ -3772,7 +3774,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. NVPTXTargetLowering::ConstraintType -NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { +NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: @@ -3794,7 +3796,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { std::pair<unsigned, const TargetRegisterClass *> NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -4251,7 +4253,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, - const DataLayout *TD, SmallVectorImpl<SDValue> &Results) { EVT ResVT = N->getValueType(0); SDLoc DL(N); @@ -4282,8 +4283,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = cast<LoadSDNode>(N); unsigned Align = LD->getAlignment(); + auto &TD = DAG.getDataLayout(); unsigned PrefAlign = - TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); + TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); if (Align < PrefAlign) { // This load is not sufficiently aligned, so bail out and let this vector // load be scalarized. 
Note that we may still be able to emit smaller @@ -4495,7 +4497,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( default: report_fatal_error("Unhandled custom legalization"); case ISD::LOAD: - ReplaceLoadVector(N, DAG, getDataLayout(), Results); + ReplaceLoadVector(N, DAG, Results); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index ed94775b3002..e5c37321a33b 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -456,24 +456,23 @@ public: /// Used to guide target specific optimizations, like loop strength /// reduction (LoopStrengthReduce.cpp) and memory optimization for /// address mode (CodeGenPrepare.cpp) - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; - EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override { + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, + EVT VT) const override { if (VT.isVector()) return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); return MVT::i1; } - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; SDValue LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -483,7 +482,7 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; - std::string getPrototype(Type *, const ArgListTy &, + std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl<ISD::OutputArg> &, unsigned retAlignment, const ImmutableCallSite *CS) const; @@ -501,7 +500,9 @@ public: const NVPTXTargetMachine *nvTM; // PTX always uses 32-bit shift amounts - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 6ab0fadf9a35..0bf72febc4a0 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -57,7 +57,6 @@ char NVPTXLowerAggrCopies::ID = 0; // Lower MemTransferInst or load-store pair to loop static void convertTransferToLoop( Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len, - //unsigned numLoads, bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) { Type *indType = len->getType(); @@ -84,6 +83,8 @@ static void convertTransferToLoop( ind->addIncoming(ConstantInt::get(indType, 0), origBB); // load from srcAddr+ind + // TODO: we can leverage the align parameter of llvm.memcpy for more efficient + // word-sized loads and stores. 
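For context on that TODO: the lowering currently emits a byte-at-a-time loop regardless of alignment. Expressed as plain C++ (a sketch with hypothetical names, ignoring the volatile flags for brevity; not code from the patch), the generated loop computes:

#include <cstddef>

// Equivalent of what convertTransferToLoop() builds with IRBuilder: one
// induction variable (the PHI 'ind' above) driving paired i8 loads and
// stores until 'len' is reached.
static void byteCopyLoop(char *dstAddr, const char *srcAddr, std::size_t len) {
  for (std::size_t ind = 0; ind != len; ++ind)
    dstAddr[ind] = srcAddr[ind]; // CreateLoad / CreateStore pair per byte
}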
Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind), srcVolatile); // store at dstAddr+ind @@ -137,13 +138,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // // Collect all the aggrLoads, aggrMemcpys and addrMemsets. // - //const BasicBlock *firstBB = &F.front(); // first BB in F for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { - //BasicBlock *bb = BI; for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ++II) { if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (!load->hasOneUse()) continue; @@ -152,7 +150,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { User *use = load->user_back(); if (StoreInst *store = dyn_cast<StoreInst>(use)) { - if (store->getOperand(0) != load) //getValueOperand + if (store->getOperand(0) != load) continue; aggrLoads.push_back(load); } @@ -188,8 +186,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // // Do the transformation of an aggr load/copy/set to a loop // - for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) { - LoadInst *load = aggrLoads[i]; + for (LoadInst *load : aggrLoads) { StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); Value *srcAddr = load->getOperand(0); Value *dstAddr = store->getOperand(1); @@ -203,20 +200,19 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { load->eraseFromParent(); } - for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) { - MemTransferInst *cpy = aggrMemcpys[i]; - Value *len = cpy->getLength(); - // llvm 2.7 version of memcpy does not have volatile - // operand yet. So always making it non-volatile - // optimistically, so that we don't see unnecessary - // st.volatile in ptx - convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false, - false, Context, F); + for (MemTransferInst *cpy : aggrMemcpys) { + convertTransferToLoop(/* splitAt */ cpy, + /* srcAddr */ cpy->getSource(), + /* dstAddr */ cpy->getDest(), + /* len */ cpy->getLength(), + /* srcVolatile */ cpy->isVolatile(), + /* dstVolatile */ cpy->isVolatile(), + /* Context */ Context, + /* Function F */ F); cpy->eraseFromParent(); } - for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) { - MemSetInst *memsetinst = aggrMemsets[i]; + for (MemSetInst *memsetinst : aggrMemsets) { Value *len = memsetinst->getLength(); Value *val = memsetinst->getValue(); convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context, diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index e83f735a551e..5a83371b07f1 100644 --- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -2,7 +2,7 @@ // // The LLVM Compiler Infrastructure // -// This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// @@ -115,7 +115,7 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) { replaceImageHandle(Handle, MF); - return true; + return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index 71645dca69c5..bd2509a3c8c9 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -48,7 +48,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU, const NVPTXTargetMachine &TM) : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM), InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)), - TSInfo(TM.getDataLayout()), FrameLowering() {} + FrameLowering() {} bool NVPTXSubtarget::hasImageHandles() const { // Enable handles for Kepler+, where CUDA supports indirect surfaces and diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 9d9072efc382..248f9e117d83 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -148,8 +148,9 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); }); + return TargetIRAnalysis([this](Function &F) { + return TargetTransformInfo(NVPTXTTIImpl(this, F)); + }); } void NVPTXPassConfig::addIRPasses() { diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index dc81802f4b5a..e7250cdba5ac 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -94,7 +94,7 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost( TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -117,3 +117,15 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } } + +void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + BaseT::getUnrollingPreferences(L, UP); + + // Enable partial unrolling and runtime unrolling, but reduce the + // threshold. This partially unrolls small loops which are often + // unrolled by the PTX to SASS compiler and unrolling earlier can be + // beneficial. + UP.Partial = UP.Runtime = true; + UP.PartialThreshold = UP.Threshold / 4; +} diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 4280888988f9..5bcd1e27a558 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -37,8 +37,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> { const NVPTXTargetLowering *getTLI() const { return TLI; }; public: - explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM) - : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
NVPTXTTIImpl(const NVPTXTTIImpl &Arg) @@ -46,18 +47,6 @@ public: NVPTXTTIImpl(NVPTXTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - NVPTXTTIImpl &operator=(const NVPTXTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - NVPTXTTIImpl &operator=(NVPTXTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } bool hasBranchDivergence() { return true; } @@ -69,6 +58,8 @@ public: TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; } // end namespace llvm diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index fe168a547597..c0c83cc258b8 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -33,7 +33,6 @@ add_llvm_target(PowerPCCodeGen PPCTargetObjectFile.cpp PPCTargetTransformInfo.cpp PPCTOCRegDeps.cpp - PPCSelectionDAGInfo.cpp PPCTLSDynamicCall.cpp PPCVSXCopy.cpp PPCVSXFMAMutate.cpp diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 5c38fe173d96..30f232a9a91e 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -51,10 +51,9 @@ static MCInstrInfo *createPPCMCInstrInfo() { return X; } -static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { - Triple TheTriple(TT); - bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || - TheTriple.getArch() == Triple::ppc64le); +static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) { + bool isPPC64 = + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); unsigned Flavour = isPPC64 ? 0 : 1; unsigned RA = isPPC64 ? 
PPC::LR8 : PPC::LR; @@ -65,9 +64,7 @@ static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitPPCMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createPPCMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, @@ -90,22 +87,20 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createPPCMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { - Triple T(TT); - if (T.isOSDarwin()) + if (TT.isOSDarwin()) RM = Reloc::DynamicNoPIC; else RM = Reloc::Static; } if (CM == CodeModel::Default) { - Triple T(TT); - if (!T.isOSDarwin() && - (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le)) + if (!TT.isOSDarwin() && + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) CM = CodeModel::Medium; } X->initMCCodeGenInfo(RM, CM, OL); @@ -231,7 +226,7 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, static MCTargetStreamer * createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); - if (TT.getObjectFormat() == Triple::ELF) + if (TT.isOSBinFormatELF()) return new PPCTargetELFStreamer(S); return new PPCTargetMachOStreamer(S); } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 87a5236e711f..199a0debf88b 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -197,7 +197,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, // External or weakly linked global variables need non-lazily-resolved stubs if (TM.getRelocationModel() != Reloc::Static && - (GV->isDeclaration() || GV->isWeakForLinker())) { + !GV->isStrongDefinitionForLinker()) { if (!GV->hasHiddenVisibility()) { SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = @@ -369,28 +369,70 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 6*4; + EncodedBytes = 0; // Materialize the jump address: EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(32).addImm(16)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF)); + ++EncodedBytes; + // Save the current TOC pointer before the remote call. + int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; + + + // If we're on ELFv1, then we need to load the actual function pointer from + // the function descriptor.
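Background for the loads that follow (the struct below is an illustration of the ABI layout, not a type used by this patch): an ELFv1 function pointer addresses a three-doubleword descriptor rather than code, so an indirect call has to dereference it first.

#include <cstdint>

// PPC64 ELFv1 function descriptor. The sequence below loads the TOC word
// (offset 8) into X2 and the entry word (offset 0) into the scratch register;
// the environment word is deliberately not loaded so that r11 stays free for
// a 'nest' argument.
struct FunctionDescriptor {
  std::uint64_t EntryPoint; // offset 0: address of the first instruction
  std::uint64_t TOCBase;    // offset 8: TOC pointer the callee expects in r2
  std::uint64_t EnvPointer; // offset 16: environment pointer (r11)
};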
+ if (!Subtarget->isELFv2ABI()) { + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter). + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(ScratchReg) + .addImm(0) + .addReg(ScratchReg)); + ++EncodedBytes; + } + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + ++EncodedBytes; + + // Restore the TOC pointer after the call. + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; } + // Each instruction is 4 bytes. + EncodedBytes *= 4; + // Emit padding. unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && @@ -624,7 +666,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { IsExternal = GV->isDeclaration(); IsCommon = GV->hasCommonLinkage(); IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker()); + !GV->isStrongDefinitionForLinker(); IsAvailExt = GV->hasAvailableExternallyLinkage(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); @@ -706,7 +748,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = getSymbol(GV); IsExternal = GV->isDeclaration(); IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker()); + !GV->isStrongDefinitionForLinker(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 416131745806..baadf081a64c 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -351,8 +351,9 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { Opcode = ISD::FTRUNC; break; } - MVT VTy = - TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true); + auto &DL = CI->getModule()->getDataLayout(); + MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), + true); if (VTy == MVT::Other) return true; diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index 874a6fce0656..5bc9124f8085 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -133,6 +133,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ // register having an odd register number. CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>, + // The 'nest' parameter, if any, is passed in R11. + CCIfNest<CCAssignToReg<[R11]>>, + // The first 8 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index fafcd76f9d18..5f236f744fc4 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -262,7 +262,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) { // fast-isel, and return its equivalent machine type in VT. // FIXME: Copied directly from ARM -- factor into base class?
bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT Evt = TLI.getValueType(Ty, true); + EVT Evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (Evt == MVT::Other || !Evt.isSimple()) return false; @@ -324,12 +324,13 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { return PPCComputeAddress(U->getOperand(0), Addr); case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return PPCComputeAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return PPCComputeAddress(U->getOperand(0), Addr); break; case Instruction::GetElementPtr: { @@ -799,7 +800,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, bool IsZExt, unsigned DestReg) { Type *Ty = SrcValue1->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); + EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -893,8 +894,8 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, // Attempt to fast-select a floating-point extend instruction. bool PPCFastISel::SelectFPExt(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; @@ -911,8 +912,8 @@ bool PPCFastISel::SelectFPExt(const Instruction *I) { // Attempt to fast-select a floating-point truncate instruction. bool PPCFastISel::SelectFPTrunc(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; @@ -992,7 +993,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { return false; Value *Src = I->getOperand(0); - EVT SrcEVT = TLI.getValueType(Src->getType(), true); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); if (!SrcEVT.isSimple()) return false; @@ -1157,7 +1158,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { // Attempt to fast-select a binary integer operation that isn't already // handled automatically. bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); // We can get here in the case when we have a binary operation on a non-legal // type and the target independent selector doesn't know how to handle it. @@ -1594,7 +1595,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; @@ -1641,7 +1642,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); unsigned SrcReg = Reg + VA.getValNo(); - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; MVT RVVT = RVEVT.getSimpleVT(); @@ -1769,8 +1770,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) { // Attempt to fast-select an integer truncate instruction. bool PPCFastISel::SelectTrunc(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16) return false; @@ -1806,8 +1807,8 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) { if (!SrcReg) return false; EVT SrcEVT, DestEVT; - SrcEVT = TLI.getValueType(SrcTy, true); - DestEVT = TLI.getValueType(DestTy, true); + SrcEVT = TLI.getValueType(DL, SrcTy, true); + DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -1979,7 +1980,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // on the "if" path here. if (CModel == CodeModel::Large || (GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker())) || + !GV->isStrongDefinitionForLinker()) || GV->isDeclaration() || GV->hasCommonLinkage() || GV->hasAvailableExternallyLinkage()) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), @@ -2127,7 +2128,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. 
if (!CEVT.isSimple()) return 0; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b4008e4a886a..87229d80d9c1 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + if (MRI.isPhysRegModified(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother @@ -1158,9 +1159,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, } } -void -PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *) const { +void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); @@ -1168,8 +1171,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); unsigned LR = RegInfo->getRARegister(); FI->setMustSaveLR(MustSaveLR(MF, LR)); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.setPhysRegUnused(LR); + SavedRegs.reset(LR); // Save R31 if necessary int FPSI = FI->getFramePointerSaveIndex(); @@ -1214,9 +1216,9 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. if (!isPPC64 && !isDarwinABI && - (MRI.isPhysRegUsed(PPC::CR2) || - MRI.isPhysRegUsed(PPC::CR3) || - MRI.isPhysRegUsed(PPC::CR4))) { + (SavedRegs.test(PPC::CR2) || + SavedRegs.test(PPC::CR3) || + SavedRegs.test(PPC::CR4))) { int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true); FI->setCRSpillFrameIndex(FrameIdx); } diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 28d074ecd79d..d6a389bfbf0d 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -45,8 +45,8 @@ public: bool needsFP(const MachineFunction &MF) const; void replaceFPWithRealFP(MachineFunction &MF) const; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index c85c2610d2f5..01a3acb742e6 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -102,7 +102,8 @@ namespace { /// getSmallIPtrImm - Return a target constant of pointer type. 
inline SDValue getSmallIPtrImm(unsigned Imm, SDLoc dl) { - return CurDAG->getTargetConstant(Imm, dl, PPCLowering->getPointerTy()); + return CurDAG->getTargetConstant( + Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout())); } /// isRotateAndMask - Returns true if Mask and Shift can be folded into a @@ -313,7 +314,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { const Module *M = MF->getFunction()->getParent(); DebugLoc dl; - if (PPCLowering->getPointerTy() == MVT::i32) { + if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { if (PPCSubTarget->isTargetELF()) { GlobalBaseReg = PPC::R30; if (M->getPICLevel() == PICLevel::Small) { @@ -342,7 +343,8 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } } return CurDAG->getRegister(GlobalBaseReg, - PPCLowering->getPointerTy()).getNode(); + PPCLowering->getPointerTy(CurDAG->getDataLayout())) + .getNode(); } /// isIntS16Immediate - This method tests to see if the node is either a 32-bit @@ -2205,7 +2207,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { SDLoc dl(N); unsigned Imm; ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); if (!PPCSubTarget->useCRBits() && @@ -2468,10 +2471,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; - return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl, - LD->getValueType(0), - PPCLowering->getPointerTy(), - MVT::Other, Ops)); + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); } else { unsigned Opcode; bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; @@ -2506,10 +2510,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Base, Offset, Chain }; - return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl, - LD->getValueType(0), - PPCLowering->getPointerTy(), - MVT::Other, Ops)); + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); } } @@ -2662,7 +2667,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::SELECT_CC: { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); // If this is a select of i1 operands, we'll pattern match it. @@ -2901,7 +2907,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { const GlobalValue *GValue = G->getGlobal(); if ((GValue->getType()->getElementType()->isFunctionTy() && - (GValue->isDeclaration() || GValue->isWeakForLinker())) || + !GValue->isStrongDefinitionForLinker()) || GValue->isDeclaration() || GValue->hasCommonLinkage() || GValue->hasAvailableExternallyLinkage()) return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, @@ -2915,7 +2921,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { // Generate a PIC-safe GOT reference. 
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); - return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(), MVT::i32); + return CurDAG->SelectNodeTo( + N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()), + MVT::i32); } case PPCISD::VADD_SPLAT: { // This expands into one of three sequences, depending on whether @@ -3398,9 +3406,8 @@ void PPCDAGToDAGISel::PeepholeCROps() { bool IsModified; do { IsModified = false; - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I); + for (SDNode &Node : CurDAG->allnodes()) { + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); if (!MachineNode || MachineNode->use_empty()) continue; SDNode *ResNode = MachineNode; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 594472bbb47b..0ed9b051ffed 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -952,7 +952,8 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. -unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { +unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { // Darwin passes everything on 4 byte boundary. if (Subtarget.isDarwin()) return 4; @@ -1055,7 +1056,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT PPCTargetLowering::getSetCCResultType(LLVMContext &C, EVT VT) const { +EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, + EVT VT) const { if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; @@ -1101,7 +1103,7 @@ static bool isConstantOrUndef(int Op, int Val) { /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1132,7 +1134,7 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1174,7 +1176,7 @@ bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, if (!Subtarget.hasP8Vector()) return false; - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1237,7 +1239,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). 
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped @@ -1262,7 +1264,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped @@ -1352,7 +1354,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { unsigned indexOffset = CheckEven ? 4 : 0; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); @@ -1399,7 +1401,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, if (ShiftAmt < i) return -1; ShiftAmt -= i; - bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool isLE = DAG.getDataLayout().isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. @@ -1456,7 +1458,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); - if (DAG.getTarget().getDataLayout()->isLittleEndian()) + if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; @@ -1796,7 +1798,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } } - Disp = DAG.getTargetConstant(0, dl, getPointerTy()); + Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); @@ -2084,7 +2086,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); const Module *M = DAG.getMachineFunction().getFunction()->getParent(); PICLevel::Level picLevel = M->getPICLevel(); @@ -2270,7 +2272,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); @@ -2399,11 +2401,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = 
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); - Type *IntPtrTy = - DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType( - *DAG.getContext()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2440,7 +2440,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2476,8 +2476,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); - - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT); @@ -2797,7 +2796,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); @@ -3023,7 +3022,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); @@ -3059,12 +3058,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = Num_FPR_Regs; unsigned AvailableVRs = Num_VR_Regs; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (Ins[i].Flags.isNest()) + continue; + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; + } // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, @@ -3216,6 +3219,17 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. 
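Background, not part of the diff: 'nest' is the attribute LLVM places on the static-chain argument of trampoline-based nested functions, roughly declare void @f(i8* nest %chain, i32 %x) in IR. This import assigns it to r11 on PowerPC (R11 in the 32-bit calling convention above, the X11 live-in below). On 64-bit ELFv1 the same register carries the environment pointer, which is why the call-lowering changes further down thread a hasNest flag to keep the two uses apart.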
+ unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + + break; + } + // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. @@ -3425,7 +3439,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && @@ -3845,7 +3859,8 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { return nullptr; // Top 6 bits have to be sext of immediate. return DAG.getConstant((int)C->getZExtValue() >> 2, SDLoc(Op), - DAG.getTargetLoweringInfo().getPointerTy()).getNode(); + DAG.getTargetLoweringInfo().getPointerTy( + DAG.getDataLayout())).getNode(); } namespace { @@ -3991,7 +4006,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, bool isVector, SmallVectorImpl<SDValue> &MemOpChains, SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, SDLoc dl) { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); if (!isTailCall) { if (isVector) { SDValue StackPtr; @@ -4053,7 +4068,7 @@ static bool isFunctionGlobalAddress(SDValue Callee) { static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff, - bool isTailCall, bool IsPatchPoint, + bool isTailCall, bool IsPatchPoint, bool hasNest, SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { @@ -4062,7 +4077,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. @@ -4084,8 +4099,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if ((DAG.getTarget().getRelocationModel() != Reloc::Static && (Subtarget.getTargetTriple().isMacOSX() && Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && - (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) || + !G->getGlobal()->isStrongDefinitionForLinker()) || (Subtarget.isTargetELF() && !isPPC64 && !G->getGlobal()->hasLocalLinkage() && DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { @@ -4196,11 +4210,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Chain = TOCVal.getValue(0); InFlag = TOCVal.getValue(1); - SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, - InFlag); + // If the function call has an explicit 'nest' parameter, it takes the + // place of the environment pointer. 
+ if (!hasNest) { + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, + InFlag); - Chain = EnvVal.getValue(0); - InFlag = EnvVal.getValue(1); + Chain = EnvVal.getValue(0); + InFlag = EnvVal.getValue(1); + } MTCTROps[0] = Chain; MTCTROps[1] = LoadFuncPtr; @@ -4218,7 +4236,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64 && !isELFv2ABI) + if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -4254,8 +4272,7 @@ static bool isLocalCall(const SDValue &Callee) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - return !G->getGlobal()->isDeclaration() && - !G->getGlobal()->isWeakForLinker(); + return G->getGlobal()->isStrongDefinitionForLinker(); return false; } @@ -4308,7 +4325,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, bool isVarArg, bool IsPatchPoint, - SelectionDAG &DAG, + bool hasNest, SelectionDAG &DAG, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, SDValue Chain, @@ -4321,8 +4338,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, std::vector<EVT> NodeTys; SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, - SPDiff, isTailCall, IsPatchPoint, RegsToPass, - Ops, NodeTys, CS, Subtarget); + SPDiff, isTailCall, IsPatchPoint, hasNest, + RegsToPass, Ops, NodeTys, CS, Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) @@ -4381,7 +4398,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); @@ -4586,7 +4603,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, unsigned LocMemOffset = ByValVA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), + StackPtr, PtrOff); // Create a copy of the argument in the local area of the current // stack frame. 
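A note on the environment-pointer juggling in the PrepareCall hunk above: under the 64-bit ELFv1 ABI, an indirect callee is reached through a three-doubleword function descriptor, and the third doubleword is what normally lands in X11. A rough sketch of that layout, with illustrative field names only (this struct is not an LLVM type):

#include <cstdint>
// ELFv1 function descriptor as consumed by the indirect-call sequence:
// the entry point is branched to via CTR, the TOC pointer is loaded into
// X2, and the environment pointer into X11. A 'nest' argument also
// travels in X11, which is why hasNest suppresses the EnvVal copy and
// the extra X11 operand above.
struct FunctionDescriptor {
  uint64_t EntryPoint; // -> CTR (mtctr/bctrl)
  uint64_t TOCBase;    // -> X2
  uint64_t EnvPointer; // -> X11, unless a 'nest' argument claimed it
};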
@@ -4623,7 +4641,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, if (!isTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), + StackPtr, PtrOff); MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), @@ -4664,7 +4683,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } @@ -4703,8 +4723,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); + bool hasNest = false; - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); @@ -4758,6 +4779,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; + if (Flags.isNest()) + continue; + if (CallConv == CallingConv::Fast) { if (Flags.isByVal()) NumGPRsUsed += (Flags.getByValSize()+7)/8; @@ -5021,6 +5045,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. + RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); + hasNest = true; + break; + } + // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. @@ -5302,9 +5333,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, - RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, - NumBytes, Ins, InVals, CS); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CS); } SDValue @@ -5320,7 +5351,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, unsigned NumOps = Outs.size(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; unsigned PtrByteSize = isPPC64 ? 
8 : 4; @@ -5693,7 +5724,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } @@ -5764,7 +5796,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); // Get the correct type for pointers. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. bool isPPC64 = Subtarget.isPPC64(); @@ -5794,7 +5826,7 @@ SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. @@ -5817,7 +5849,7 @@ SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. @@ -5845,7 +5877,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDLoc dl(Op); // Get the correct type for pointers. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); // Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, DAG.getConstant(0, dl, PtrVT), Size); @@ -5888,8 +5920,9 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BasePtr = LD->getBasePtr(); MachineMemOperand *MMO = LD->getMemOperand(); - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain, - BasePtr, MVT::i8, MMO); + SDValue NewLD = + DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, + BasePtr, MVT::i8, MMO); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; @@ -5913,7 +5946,8 @@ SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Value = ST->getValue(); MachineMemOperand *MMO = ST->getMemOperand(); - Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value); + Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), + Value); return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); } @@ -6374,7 +6408,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SINT.getOpcode() == ISD::ZERO_EXTEND)) && SINT.getOperand(0).getValueType() == MVT::i32) { MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -6419,7 +6453,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // then lfd it and fcfid it. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { @@ -6506,7 +6540,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Save FP Control Word to register EVT NodeTys[] = { @@ -6727,7 +6761,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); assert(BVN->getNumOperands() == 4 && @@ -6760,9 +6794,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(), - 16 /* alignment */); - + SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), + 16 /* alignment */); + SmallVector<SDValue, 2> Ops; Ops.push_back(DAG.getEntryNode()); Ops.push_back(CPIdx); @@ -7453,7 +7487,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, // Create a stack slot that is 16-byte aligned. MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); // Store the input value into Value#0 of the stack slot. 
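The i1 load/store lowerings above widen the value through i8 memory operations; a C-level analogue of the node sequence, offered as a sketch of the scheme rather than the lowering code itself:

#include <cstdint>
// load i1: EXTLOAD an i8, then TRUNCATE the result down to i1.
bool load_i1(const uint8_t *p) {
  uint8_t wide = *p; // extending i8 load
  return wide & 1;   // truncate to i1
}
// store i1: ZERO_EXTEND to the pointer-width type, then truncstore as i8.
void store_i1(uint8_t *p, bool v) {
  uint64_t wide = v;  // zero extension
  *p = (uint8_t)wide; // truncating i8 store
}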
@@ -7499,7 +7533,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue StoreChain = DAG.getEntryNode(); @@ -7651,9 +7685,9 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SmallVector<SDValue, 8> Stores; for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getConstant(Idx, dl, getVectorIdxTy())); + SDValue Ex = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, + DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); SDValue Store; if (ScalarVT != ScalarMemVT) Store = @@ -7715,7 +7749,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SmallVector<SDValue, 2> Ops; @@ -7920,7 +7954,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, assert(N->getValueType(0) == MVT::i1 && "Unexpected result type for CTR decrement intrinsic"); - EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0)); + EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), N->getOperand(1)); @@ -8248,7 +8283,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate @@ -8386,7 +8421,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -9032,6 +9067,19 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Target Optimization Hooks //===----------------------------------------------------------------------===// +static std::string getRecipOp(const char *Base, EVT VT) { + std::string RecipOp(Base); + if (VT.getScalarType() == MVT::f64) + RecipOp += "d"; + else + RecipOp += "f"; + + if (VT.isVector()) + RecipOp = "vec-" + RecipOp; + + return RecipOp; +} + SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, @@ -9043,13 +9091,12 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { - // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. For both FRE and FRSQRTE, the minimum - // architected relative accuracy is 2^-5. 
When hasRecipPrec(), this is - // 2^-14. IEEE float has 23 digits and double has 52 digits. - RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; - if (VT.getScalarType() == MVT::f64) - ++RefinementSteps; + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + std::string RecipOp = getRecipOp("sqrt", VT); + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); UseOneConstNR = true; return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } @@ -9066,13 +9113,12 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { - // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. For both FRE and FRSQRTE, the minimum - // architected relative accuracy is 2^-5. When hasRecipPrec(), this is - // 2^-14. IEEE float has 23 digits and double has 52 digits. - RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; - if (VT.getScalarType() == MVT::f64) - ++RefinementSteps; + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + std::string RecipOp = getRecipOp("div", VT); + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } return SDValue(); } @@ -9854,7 +9900,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, assert(N->getOpcode() == ISD::SIGN_EXTEND && "Invalid extension type"); - EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0)); + EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); return DAG.getNode(ISD::SRA, dl, N->getValueType(0), @@ -10145,9 +10191,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - unsigned ScalarABIAlignment = getDataLayout()->getABITypeAlignment(STy); + unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. @@ -10219,7 +10265,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, 2*MemVT.getStoreSize()-1); // Create the new base load.
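The step counts that getRsqrtEstimate and getRecipEstimate now pull from Options.Reciprocals preserve the arithmetic of the comments deleted above; a standalone check of that arithmetic (plain C++, not LLVM code):

// Newton-Raphson refinement converges quadratically, doubling the number
// of correct bits per step. Starting from the architected 2^-5 estimate
// (2^-14 with hasRecipPrec()), count the steps needed to cover float's 23
// and double's 52 fraction bits.
constexpr unsigned stepsNeeded(unsigned haveBits, unsigned wantBits) {
  return haveBits >= wantBits ? 0 : 1 + stepsNeeded(2 * haveBits, wantBits);
}
static_assert(stepsNeeded(5, 23) == 3, "old RefinementSteps = 3");
static_assert(stepsNeeded(5, 52) == 4, "old RefinementSteps + 1 for f64");
static_assert(stepsNeeded(14, 23) == 1, "old RefinementSteps = 1");
static_assert(stepsNeeded(14, 52) == 2, "old RefinementSteps + 1 for f64");

These are exactly the defaults installed via setDefaults in the PPCTargetMachine.cpp hunk further down.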
- SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy()); + SDValue LDXIntID = + DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue BaseLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, @@ -10243,7 +10290,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (!findConsecutiveLoad(LD, DAG)) --IncValue; - SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy()); + SDValue Increment = + DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); MachineMemOperand *ExtraMMO = @@ -10691,7 +10739,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. PPCTargetLowering::ConstraintType -PPCTargetLowering::getConstraintType(const std::string &Constraint) const { +PPCTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; @@ -10776,7 +10824,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters @@ -10923,8 +10971,8 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) @@ -10977,22 +11025,22 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); FuncInfo->setLRStoreRequired(); bool isPPC64 = Subtarget.isPPC64(); + auto PtrVT = getPointerTy(MF.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, isPPC64 ? MVT::i64 : MVT::i32); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), - FrameAddr, Offset), + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address off the stack. 
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, false, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo(), false, false, false, 0); } SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, @@ -11000,13 +11048,13 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc dl(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - bool isPPC64 = PtrVT == MVT::i64; - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + bool isPPC64 = PtrVT == MVT::i64; + // Naked functions never have a frame pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned FrameReg; @@ -11026,8 +11074,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned PPCTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { bool isPPC64 = Subtarget.isPPC64(); bool isDarwinABI = Subtarget.isDarwinABI(); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 02242b512a4f..6e13533cfdb3 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -423,7 +423,9 @@ namespace llvm { /// DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } bool isCheapToSpeculateCttz() const override { return true; @@ -434,7 +436,8 @@ } /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; /// Return true if target always benefits from combining into FMA for a /// given value type. This must typically return false on targets where FMA @@ -487,7 +490,8 @@ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, @@ -519,8 +523,7 @@ namespace llvm { MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const; - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type.
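For the Depth > 0 path of LowerRETURNADDR above, an informal C analogue may help; the 16-byte offset stands in for getReturnSaveOffset() on 64-bit ELF and is illustrative, not authoritative:

// Roughly what __builtin_return_address(1) lowers to: walk one frame up
// the back chain, then load the saved LR from the caller's linkage area.
// (__builtin_frame_address with a nonzero argument is itself only
// best-effort, mirroring the limits of deep unwinding here.)
void *return_address_depth1() {
  char *callerFrame = (char *)__builtin_frame_address(1);
  return *(void **)(callerFrame + 16); // LR save slot
}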
@@ -529,13 +532,13 @@ namespace llvm { std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual /// alignment, not its logarithm. - unsigned getByValTypeAlignment(Type *Ty) const override; + unsigned getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. @@ -544,8 +547,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "es") return InlineAsm::Constraint_es; else if (ConstraintCode == "o") @@ -561,8 +564,8 @@ namespace llvm { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -745,7 +748,7 @@ namespace llvm { SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, - bool isVarArg, bool IsPatchPoint, + bool isVarArg, bool IsPatchPoint, bool hasNest, SelectionDAG &DAG, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 696a83860e53..bf6e40296405 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -57,6 +57,10 @@ static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy", cl::desc("Causes the backend to crash instead of generating a nop VSX copy"), cl::Hidden); +static cl::opt<bool> +UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, + cl::desc("Use the old (incorrect) instruction latency calculation")); + // Pin the vtable to this file. void PPCInstrInfo::anchor() {} @@ -103,6 +107,35 @@ PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, return new ScoreboardHazardRecognizer(II, DAG); } +unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (!ItinData || UseOldLatencyCalc) + return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost); + + // The default implementation of getInstrLatency calls getStageLatency, but + // getStageLatency does not do the right thing for us. While we have + // itinerary, most cores are fully pipelined, and so the itineraries only + // express the first part of the pipeline, not every stage. Instead, we need + // to use the listed output operand cycle number (using operand 0 here, which + // is an output). 
+ + unsigned Latency = 1; + unsigned DefClass = MI->getDesc().getSchedClass(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || MO.isImplicit()) + continue; + + int Cycle = ItinData->getOperandCycle(DefClass, i); + if (Cycle < 0) + continue; + + Latency = std::max(Latency, (unsigned) Cycle); + } + + return Latency; +} int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index e2d6346aa532..40badae644d6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -95,6 +95,10 @@ public: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + int getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43ba4994fde6..20c95fe888e0 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -989,6 +989,18 @@ def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B), def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B), (XVDIVDP $A, $B)>; +// Reciprocal estimate +def : Pat<(int_ppc_vsx_xvresp v4f32:$A), + (XVRESP $A)>; +def : Pat<(int_ppc_vsx_xvredp v2f64:$A), + (XVREDP $A)>; + +// Recip. square root estimate +def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A), + (XVRSQRTESP $A)>; +def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A), + (XVRSQRTEDP $A)>; + } // AddedComplexity } // HasVSX @@ -1013,6 +1025,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. v4i32:$XB)))]>; } // isCommutable + def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), + (XXLEQV $A, $B)>; + def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 656376c641aa..2b09b2f625de 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -165,8 +165,7 @@ void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const PPCFrameLowering *PPCFI = - static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); + const PPCFrameLowering *TFI = getFrameLowering(MF); // The ZERO register is not really a register, but the representation of r0 // when used in instructions that treat r0 as the constant 0. 
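To see what the new getInstrLatency above computes, a toy rendering of the operand-cycle scan with assumed numbers (not a real itinerary):

#include <algorithm>
#include <vector>
// Latency is the maximum listed cycle across def operands. A fully
// pipelined 5-cycle FPU op whose itinerary only describes its first
// stage still reports 5 here, where a stage-latency sum would claim 1.
unsigned latencyFromDefCycles(const std::vector<int> &DefCycles) {
  unsigned Latency = 1;
  for (int Cycle : DefCycles)
    if (Cycle >= 0)
      Latency = std::max(Latency, (unsigned)Cycle);
  return Latency;
}
// latencyFromDefCycles({5}) == 5, matching the [5, 1, 1] operand cycles
// in the P7/P8 itinerary entries added further down.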
@@ -209,7 +208,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::X1); Reserved.set(PPC::X13); - if (PPCFI->needsFP(MF)) + if (TFI->needsFP(MF)) Reserved.set(PPC::X31); if (hasBasePointer(MF)) @@ -230,7 +229,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } - if (PPCFI->needsFP(MF)) + if (TFI->needsFP(MF)) Reserved.set(PPC::R31); if (hasBasePointer(MF)) { @@ -256,8 +255,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const PPCFrameLowering *TFI = getFrameLowering(MF); const unsigned DefaultSafety = 1; switch (RC->getID()) { @@ -341,7 +339,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned FrameSize = MFI->getStackSize(); // Get stack alignments. - unsigned TargetAlign = Subtarget.getFrameLowering()->getStackAlignment(); + const PPCFrameLowering *TFI = getFrameLowering(MF); + unsigned TargetAlign = TFI->getStackAlignment(); unsigned MaxAlign = MFI->getMaxAlignment(); assert((maxCallFrameSize & (MaxAlign-1)) == 0 && "Maximum call-frame size not sufficiently aligned"); @@ -864,8 +863,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const PPCFrameLowering *TFI = getFrameLowering(MF); if (!TM.isPPC64()) return TFI->hasFP(MF) ? PPC::R31 : PPC::R1; @@ -908,10 +906,10 @@ bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const PPCFrameLowering *TFI = getFrameLowering(MF); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = Subtarget.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -946,11 +944,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const PPCFrameLowering *PPCFI = - static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); - unsigned StackEst = - PPCFI->determineFrameLayout(MF, false, true); + const PPCFrameLowering *TFI = getFrameLowering(MF); + unsigned StackEst = TFI->determineFrameLayout(MF, false, true); // If we likely don't need a stack frame, then we probably don't need a // virtual base register either. 
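As a concrete trigger for the needsStackRealignment logic above (a sketch assuming the usual 16-byte PPC stack alignment):

// MaxAlignment = 64 exceeds StackAlign = 16, so this function requires
// realignment, subject to the canRealignStack and attribute checks above.
void overAligned() {
  alignas(64) char buf[128]; // drives MFI->getMaxAlignment() to 64
  asm volatile("" : : "r"(buf)); // keep buf observable
}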
@@ -1034,4 +1029,3 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, MI->getOpcode() == TargetOpcode::PATCHPOINT || (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); } - diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index 635d154d10bf..267f56726180 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -315,6 +315,10 @@ def P7Itineraries : ProcessorItineraries< P7_DU3, P7_DU4], 0>, InstrStage<1, [P7_VS1, P7_VS2]>], [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>, InstrStage<1, [P7_VS1, P7_VS2]>], diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td index 020739baec3a..69e6d05c6604 100644 --- a/lib/Target/PowerPC/PPCScheduleP8.td +++ b/lib/Target/PowerPC/PPCScheduleP8.td @@ -323,6 +323,10 @@ def P8Itineraries : ProcessorItineraries< P8_DU4, P8_DU5, P8_DU6], 0>, InstrStage<1, [P8_FPU1, P8_FPU2]>], [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1]>, InstrItinData<IIC_FPCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6], 0>, InstrStage<1, [P8_FPU1, P8_FPU2]>], diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp deleted file mode 100644 index dc1674214769..000000000000 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PPCSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "PPCTargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "powerpc-selectiondag-info" - -PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - -PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {} diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h deleted file mode 100644 index 2c1378d5670d..000000000000 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PowerPC subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class PPCTargetMachine; - -class PPCSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit PPCSelectionDAGInfo(const DataLayout *DL); - ~PPCSelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index cf603fe17723..58daccae90f2 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -53,7 +53,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, IsPPC64(TargetTriple.getArch() == Triple::ppc64 || TargetTriple.getArch() == Triple::ppc64le), TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), - InstrInfo(*this), TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {} + InstrInfo(*this), TLInfo(TM, *this) {} void PPCSubtarget::initializeEnvironment() { StackAlignment = 16; diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index e9cc3d4bd5bc..0616c1f65604 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -17,10 +17,10 @@ #include "PPCFrameLowering.h" #include "PPCISelLowering.h" #include "PPCInstrInfo.h" -#include "PPCSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -129,7 +129,7 @@ protected: PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; PPCTargetLowering TLInfo; - PPCSelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; public: /// This constructor initializes the data members to match that @@ -164,7 +164,7 @@ public: const PPCTargetLowering *getTargetLowering() const override { return &TLInfo; } - const PPCSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } const PPCRegisterInfo *getRegisterInfo() const override { diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 074bc870751a..1daf244fed44 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -172,7 +172,26 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM, CM, OL), TLOF(createTLOF(getTargetTriple())), - TargetABI(computeTargetABI(TT, Options)) { + TargetABI(computeTargetABI(TT, Options)), + Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) { + + // For the estimates, convergence is quadratic, so we essentially double the + // number of digits correct after every iteration. For both FRE and FRSQRTE, + // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), + // this is 2^-14. IEEE float has 23 digits and double has 52 digits. + unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 
1 : 3, + RefinementSteps64 = RefinementSteps + 1; + + this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps); + + this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64); + initAsmInfo(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 5c0f7e629a69..6496339519a1 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -29,6 +29,8 @@ public: private: std::unique_ptr<TargetLoweringObjectFile> TLOF; PPCABI TargetABI; + PPCSubtarget Subtarget; + mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; public: diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 25d563a7d975..e21c2b77f4d7 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -317,7 +317,7 @@ unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 35e7a1497c83..368bef93f0dd 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -38,7 +38,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { public: explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. PPCTTIImpl(const PPCTTIImpl &Arg) @@ -46,18 +47,6 @@ public: PPCTTIImpl(PPCTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - PPCTTIImpl &operator=(const PPCTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - PPCTTIImpl &operator=(PPCTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index f352fa647ace..58d3c3d3fa2e 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -136,6 +136,16 @@ protected: // source of the copy, it must still be live here. We can't use // interval testing for a physical register, so as long as we're // walking the MIs we may as well test liveness here. + // + // FIXME: There is a case that occurs in practice, like this: + // %vreg9<def> = COPY %F1; VSSRC:%vreg9 + // ... 
+ // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9 + // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9 + // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC: + // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC: + // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC: + // which prevents an otherwise-profitable transformation. bool OtherUsers = false, KillsAddendSrc = false; for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI); J != JE; --J) { diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index e7ab71ac2106..3fb1dcc3d4af 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -80,6 +80,7 @@ struct PPCVSXSwapEntry { unsigned int IsSwap : 1; unsigned int MentionsPhysVR : 1; unsigned int IsSwappable : 1; + unsigned int MentionsPartialVR : 1; unsigned int SpecialHandling : 3; unsigned int WebRejected : 1; unsigned int WillRemove : 1; @@ -91,7 +92,9 @@ enum SHValues { SH_INSERT, SH_NOSWAP_LD, SH_NOSWAP_ST, - SH_SPLAT + SH_SPLAT, + SH_XXPERMDI, + SH_COPYSCALAR }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -167,6 +170,21 @@ private: isRegInClass(Reg, &PPC::VRRCRegClass)); } + // Return true iff the given register is a partial vector register. + bool isScalarVecReg(unsigned Reg) { + return (isRegInClass(Reg, &PPC::VSFRCRegClass) || + isRegInClass(Reg, &PPC::VSSRCRegClass)); + } + + // Return true iff the given register mentions all or part of a + // vector register. Also sets Partial to true if the mention + // is for just the floating-point register overlap of the register. + bool isAnyVecReg(unsigned Reg, bool &Partial) { + if (isScalarVecReg(Reg)) + Partial = true; + return isScalarVecReg(Reg) || isVecReg(Reg); + } + public: // Main entry point for this pass. bool runOnMachineFunction(MachineFunction &MF) override { @@ -223,12 +241,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { for (MachineInstr &MI : MBB) { bool RelevantInstr = false; + bool Partial = false; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); - if (isVecReg(Reg)) { + if (isAnyVecReg(Reg, Partial)) { RelevantInstr = true; break; } @@ -250,8 +269,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { // Unless noted otherwise, an instruction is considered // safe for the optimization. There are a large number of // such true-SIMD instructions (all vector math, logical, - // select, compare, etc.). - SwapVector[VecIdx].IsSwappable = 1; + // select, compare, etc.). However, if the instruction + // mentions a partial vector register and does not have + // special handling defined, it is not swappable. + if (Partial) + SwapVector[VecIdx].MentionsPartialVR = 1; + else + SwapVector[VecIdx].IsSwappable = 1; break; case PPC::XXPERMDI: { // This is a swap if it is of the form XXPERMDI t, s, s, 2. @@ -269,25 +293,37 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { VecIdx); if (trueReg1 == trueReg2) SwapVector[VecIdx].IsSwap = 1; - } + else { + // We can still handle these if the two registers are not + // identical, by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + } // This is a doubleword splat if it is of the form // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we // must look through chains of copy-likes to find the source // register. 
We turn off the marking for mention of a physical - // register, because splatting it is safe; the optimization - // will not swap the value in the physical register. - else if (immed == 0 || immed == 3) { + // will not swap the value in the physical register. Whether + // or not the two input registers are identical, we can handle + // these by adjusting the form of the XXPERMDI. + } else if (immed == 0 || immed == 3) { + + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(), VecIdx); unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(), VecIdx); - if (trueReg1 == trueReg2) { - SwapVector[VecIdx].IsSwappable = 1; + if (trueReg1 == trueReg2) SwapVector[VecIdx].MentionsPhysVR = 0; - } + + } else { + // We can still handle these by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; } - // Any other form of XXPERMDI is lane-sensitive and unsafe - // for the optimization. break; } case PPC::LVX: @@ -324,7 +360,32 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { if (isVecReg(MI.getOperand(0).getReg()) && isVecReg(MI.getOperand(1).getReg())) SwapVector[VecIdx].IsSwappable = 1; + // If we have a copy from one scalar floating-point register + // to another, we can accept this even if it is a physical + // register. The only way this gets involved is if it feeds + // a SUBREG_TO_REG, which is handled by introducing a swap. + else if (isScalarVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(1).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::SUBREG_TO_REG: { + // These are fine provided they are moving between full vector + // register classes. If they are moving from a scalar + // floating-point class to a vector class, we can handle those + // as well, provided we introduce a swap. It is generally the + // case that we will introduce fewer swaps than we remove, but + // (FIXME) a cost model could be used. However, introduced + // swaps could potentially be CSEd, so this is not trivial. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(2).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + else if (isVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(2).getReg())) { + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + } break; + } case PPC::VSPLTB: case PPC::VSPLTH: case PPC::VSPLTW: @@ -425,6 +486,10 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::VUPKLSW: case PPC::XXMRGHW: case PPC::XXMRGLW: + // XXSLDWI could be replaced by a general permute with one of three + // permute control vectors (for shift values 1, 2, 3). However, + // VPERM has a more restrictive register class. + case PPC::XXSLDWI: case PPC::XXSPLTW: break; } @@ -501,18 +566,20 @@ void PPCVSXSwapRemoval::formWebs() { DEBUG(MI->dump()); // It's sufficient to walk vector uses and join them to their unique - // definitions. In addition, check *all* vector register operands - // for physical regs. + // definitions. In addition, check full vector register operands + // for physical regs. We exclude partial-vector register operands + // because we can handle them if copied to a full vector.
for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); - if (!isVecReg(Reg)) + if (!isVecReg(Reg) && !isScalarVecReg(Reg)) continue; if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - SwapVector[EntryIdx].MentionsPhysVR = 1; + if (!(MI->isCopy() && isScalarVecReg(Reg))) + SwapVector[EntryIdx].MentionsPhysVR = 1; continue; } @@ -545,15 +612,21 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); - // Reject webs containing mentions of physical registers, or containing - // operations that we don't know how to handle in a lane-permuted region. + // If representative is already rejected, don't waste further time. + if (SwapVector[Repr].WebRejected) + continue; + + // Reject webs containing mentions of physical or partial registers, or + // containing operations that we don't know how to handle in a lane- + // permuted region. if (SwapVector[EntryIdx].MentionsPhysVR || + SwapVector[EntryIdx].MentionsPartialVR || !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, subreg, or not swap[pable]\n", + format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); @@ -588,7 +661,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { } } - // Reject webs than contain swapping stores that are fed by something + // Reject webs that contain swapping stores that are fed by something // other than a swap instruction. } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; @@ -670,7 +743,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. -// FIXME: This code is to be phased in with subsequent patches. +// FIXME: Additional opportunities will be phased in with subsequent +// patches. void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (SwapVector[EntryIdx].SpecialHandling) { @@ -704,6 +778,91 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { break; } + // For an XXPERMDI that isn't handled otherwise, we need to + // reverse the order of the operands. If the selector operand + // has a value of 0 or 3, we need to change it to 3 or 0, + // respectively. Otherwise we should leave it alone. (This + // is equivalent to reversing the two bits of the selector + // operand and complementing the result.) + case SHValues::SH_XXPERMDI: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing XXPERMDI: "); + DEBUG(MI->dump()); + + unsigned Selector = MI->getOperand(3).getImm(); + if (Selector == 0 || Selector == 3) + Selector = 3 - Selector; + MI->getOperand(3).setImm(Selector); + + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + MI->getOperand(1).setReg(Reg2); + MI->getOperand(2).setReg(Reg1); + + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + break; + } + + // For a copy from a scalar floating-point register to a vector + // register, removing swaps will leave the copied value in the + // wrong lane. Insert a swap following the copy to fix this. 
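The parenthetical claim in the SH_XXPERMDI comment above, that the rewrite is equivalent to reversing the selector's two bits and complementing the result, is easy to verify in isolation; a standalone check:

#include <cassert>
// Exchanging the two XXPERMDI sources means rewriting the 2-bit selector
// by bit-reversal plus complement; that collapses to swapping 0 and 3
// while leaving 1 and 2 alone, exactly the "Selector = 3 - Selector"
// rewrite guarded by (Selector == 0 || Selector == 3) in the case above.
unsigned swappedSelector(unsigned Sel) {
  unsigned Reversed = ((Sel & 1) << 1) | ((Sel >> 1) & 1);
  return ~Reversed & 3u;
}
int main() {
  assert(swappedSelector(0) == 3 && swappedSelector(3) == 0);
  assert(swappedSelector(1) == 1 && swappedSelector(2) == 2);
  return 0;
}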
+ case SHValues::SH_COPYSCALAR: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); + DEBUG(MI->dump()); + + unsigned DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + unsigned NewVReg = MRI->createVirtualRegister(DstRC); + + MI->getOperand(0).setReg(NewVReg); + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + + MachineBasicBlock::iterator InsertPoint = MI->getNextNode(); + + // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG + // is copying to a VRRC, we need to be careful to avoid a register + // assignment problem. In this case we must copy from VRRC to VSRC + // prior to the swap, and from VSRC to VRRC following the swap. + // Coalescing will usually remove all this mess. + + if (DstRC == &PPC::VRRCRegClass) { + unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), VSRCTmp1) + .addReg(NewVReg); + DEBUG(MI->getNextNode()->dump()); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), VSRCTmp2) + .addReg(VSRCTmp1) + .addReg(VSRCTmp1) + .addImm(2); + DEBUG(MI->getNextNode()->getNextNode()->dump()); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), DstReg) + .addReg(VSRCTmp2); + DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); + + } else { + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(NewVReg) + .addReg(NewVReg) + .addImm(2); + + DEBUG(MI->getNextNode()->dump()); + } + break; + } } } @@ -756,6 +915,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { DEBUG(dbgs() << "swap "); if (SwapVector[EntryIdx].MentionsPhysVR) DEBUG(dbgs() << "physreg "); + if (SwapVector[EntryIdx].MentionsPartialVR) + DEBUG(dbgs() << "partialreg "); if (SwapVector[EntryIdx].IsSwappable) { DEBUG(dbgs() << "swappable "); @@ -780,6 +941,12 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_SPLAT: DEBUG(dbgs() << "special:splat "); break; + case SH_XXPERMDI: + DEBUG(dbgs() << "special:xxpermdi "); + break; + case SH_COPYSCALAR: + DEBUG(dbgs() << "special:copyscalar "); + break; } } diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt index c486411f9a1e..5b7bfdd28020 100644 --- a/lib/Target/Sparc/CMakeLists.txt +++ b/lib/Target/Sparc/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_target(SparcCodeGen SparcRegisterInfo.cpp SparcSubtarget.cpp SparcTargetMachine.cpp - SparcSelectionDAGInfo.cpp SparcMCInstLower.cpp SparcTargetObjectFile.cpp ) diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 91d2eeef0cc0..9113e4a46b96 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -57,7 +57,7 @@ static MCInstrInfo *createSparcMCInstrInfo() { return X; } -static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitSparcMCRegisterInfo(X, SP::O7); return X; @@ -65,11 +65,9 @@ static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); if (CPU.empty()) CPU = (TT.getArch() == 
Triple::sparcv9) ? "v9" : "v8"; - InitSparcMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createSparcMCSubtargetInfoImpl(TT, CPU, FS); } // Code models. Some only make sense for 64-bit code. @@ -83,7 +81,8 @@ createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { // // All code models require that the text segment is smaller than 2GB. -static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSparcMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); @@ -100,7 +99,8 @@ static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSparcV9MCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index bccc6bdd53eb..8fa10dcae114 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; return true; @@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed + || !MRI.reg_nodbg_empty(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { @@ -218,16 +218,13 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MRI.isPhysRegUsed(reg)) + if (MRI.reg_nodbg_empty(reg)) continue; unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(!MRI.isPhysRegUsed(mapped_reg)); + assert(MRI.reg_nodbg_empty(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); - - // Mark the reg unused. - MRI.setPhysRegUnused(reg); } // Rewrite MBB's Live-ins. 
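The isPhysRegUsed to reg_nodbg_empty conversion in this file tightens what counts as a register use; restated as a helper (a sketch, not the MachineRegisterInfo implementation):

#include "llvm/CodeGen/MachineRegisterInfo.h"
// A physical register now counts as used iff some non-debug operand in
// the function references it; DBG_VALUE-only references no longer block
// leaf-procedure detection or the %i -> %o remapping above.
static bool physRegUsed(const llvm::MachineRegisterInfo &MRI, unsigned Reg) {
  return !MRI.reg_nodbg_empty(Reg);
}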
@@ -247,9 +244,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { #endif } -void SparcFrameLowering::processFunctionBeforeCalleeSavedScan - (MachineFunction &MF, RegScavenger *RS) const { - +void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); if (!DisableLeafProc && isLeafProc(MF)) { SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>(); MFI->setLeafProc(true); diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index bb3b78861cbd..29fc7b7ba036 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -36,8 +36,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; private: // Remap input registers to output registers for leaf procedure. diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 9c594a9f0f65..340b72e7940f 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -67,13 +67,16 @@ private: SDNode* SparcDAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, + TLI->getPointerTy(CurDAG->getDataLayout())) + .getNode(); } bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr, SDValue &Base, SDValue &Offset) { if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); return true; } @@ -88,8 +91,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr, if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) { // Constant offset from frame ref. - Base = - CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); } else { Base = Addr.getOperand(0); } @@ -134,7 +137,7 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) { } R1 = Addr; - R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy()); + R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy(CurDAG->getDataLayout())); return true; } @@ -168,10 +171,9 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { } else { TopPart = CurDAG->getRegister(SP::G0, MVT::i32); } - TopPart = SDValue(CurDAG->getMachineNode(SP::WRASRrr, dl, MVT::i32, - TopPart, - CurDAG->getRegister(SP::G0, MVT::i32)), 0); - TopPart = CurDAG->getCopyToReg(TopPart, dl, SP::Y, TopPart, SDValue()).getValue(1); + TopPart = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SP::Y, TopPart, + SDValue()) + .getValue(1); // FIXME: Handle div by immediate. unsigned Opcode = N->getOpcode() == ISD::SDIV ? 
SP::SDIVrr : SP::UDIVrr; @@ -184,12 +186,11 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDValue MulLHS = N->getOperand(0); SDValue MulRHS = N->getOperand(1); unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr; - SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Glue, - MulLHS, MulRHS); - // The high part is in the Y register. - return CurDAG->SelectNodeTo(N, SP::RDASR, MVT::i32, - CurDAG->getRegister(SP::Y, MVT::i32), - SDValue(Mul, 1)); + SDNode *Mul = + CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MulLHS, MulRHS); + SDValue ResultHigh = SDValue(Mul, 1); + ReplaceUses(SDValue(N, 0), ResultHigh); + return nullptr; } } diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 0481676dc1ac..4879d4ee79e5 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -221,10 +221,11 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, unsigned Reg = SFI->getSRetReturnReg(); if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT); Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy())); + RetOps.push_back(DAG.getRegister(SP::I0, PtrVT)); RetAddrOffset = 12; // CallInst + Delay Slot + Unimp } @@ -418,6 +419,7 @@ LowerFormalArguments_32(SDValue Chain, assert(VA.isMemLoc()); unsigned Offset = VA.getLocMemOffset()+StackOffset; + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (VA.needsCustom()) { assert(VA.getValVT() == MVT::f64); @@ -426,7 +428,7 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(8, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo(), false,false, false, 0); @@ -437,14 +439,14 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(4, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo(), false, false, false, 0); int FI2 = MF.getFrameInfo()->CreateFixedObject(4, Offset+4, true); - SDValue FIPtr2 = DAG.getFrameIndex(FI2, getPointerTy()); + SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT); SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo(), @@ -460,7 +462,7 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(4, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue Load ; if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) { Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, @@ -607,10 +609,10 @@ LowerFormalArguments_64(SDValue Chain, if (VA.isExtInLoc()) Offset += 8 - ValSize; int FI = MF.getFrameInfo()->CreateFixedObject(ValSize, Offset, true); - InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, - DAG.getFrameIndex(FI, getPointerTy()), - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad( + VA.getValVT(), DL, Chain, + DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), + MachinePointerInfo::getFixedStack(FI), 
false, false, false, 0)); } if (!IsVarArg) @@ -637,10 +639,10 @@ LowerFormalArguments_64(SDValue Chain, unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass); SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true); - OutChains.push_back(DAG.getStore(Chain, DL, VArg, - DAG.getFrameIndex(FI, getPointerTy()), - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + auto PtrVT = getPointerTy(MF.getDataLayout()); + OutChains.push_back( + DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), + MachinePointerInfo::getFixedStack(FI), false, false, 0)); } if (!OutChains.empty()) @@ -722,7 +724,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, unsigned Align = Flags.getByValAlign(); int FI = MFI->CreateStackObject(Size, Align, false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32); Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align, @@ -993,7 +995,7 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); Type *ElementTy = Ty->getElementType(); - return getDataLayout()->getTypeAllocSize(ElementTy); + return DAG.getDataLayout().getTypeAllocSize(ElementTy); } @@ -1057,6 +1059,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Sparc target does not yet support tail call optimization. CLI.IsTailCall = false; @@ -1130,13 +1133,11 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Store and reload into the interger register reg and reg+1. unsigned Offset = 8 * (VA.getLocReg() - SP::I0); unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128; - SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy()); + SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT); SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL); - HiPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - HiPtrOff); + HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff); SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL); - LoPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - LoPtrOff); + LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff); // Store to %sp+BIAS+128+Offset SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff, @@ -1180,13 +1181,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, assert(VA.isMemLoc()); // Create a store off the stack pointer for this argument. - SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy()); + SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT); // The argument area starts at %fp+BIAS+128 in the callee frame, // %sp+BIAS+128 in ours. SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + Subtarget->getStackPointerBias() + 128, DL); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false, false, 0)); @@ -1215,10 +1216,9 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
SparcMCExpr::VK_Sparc_WPLT30 : 0); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0, - TF); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) - Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), TF); + Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF); // Build the operands for the call instruction itself. SmallVector<SDValue, 8> Ops; @@ -1370,6 +1370,8 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + auto &DL = *TM.getDataLayout(); + // Set up the register classes. addRegisterClass(MVT::i32, &SP::IntRegsRegClass); addRegisterClass(MVT::f32, &SP::FPRegsRegClass); @@ -1394,10 +1396,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f64, Expand); // Custom legalize GlobalAddress nodes into LO/HI parts. - setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom); - setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom); - setOperationAction(ISD::ConstantPool, getPointerTy(), Custom); - setOperationAction(ISD::BlockAddress, getPointerTy(), Custom); + setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom); + setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom); // Sparc doesn't have sext_inreg, replace them with shl/sra setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -1704,7 +1706,8 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -1804,7 +1807,7 @@ SDValue SparcTargetLowering::makeHiLoPair(SDValue Op, // or ExternalSymbol SDNode. SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT VT = getPointerTy(); + EVT VT = getPointerTy(DAG.getDataLayout()); // Handle PIC mode first. if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { @@ -1871,7 +1874,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = getTargetMachine().getTLSModel(GV); @@ -1983,7 +1986,7 @@ SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args, if (ArgTy->isFP128Ty()) { // Create a stack object and pass the pointer to the library function. 
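// (Editor's sketch, not part of the patch: fp128 has no legal register
// class here, so a libcall such as _Q_cmp(a, b) takes its f128 arguments
// indirectly; each value is spilled to the fresh 16-byte, 8-aligned stack
// slot created below, and the slot's address is passed in its place.)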
int FI = MFI->CreateStackObject(16, 8, false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Entry.Node, @@ -2008,8 +2011,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, ArgListTy Args; MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy()); + SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT); Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); Type *RetTyABI = RetTy; SDValue Chain = DAG.getEntryNode(); @@ -2019,7 +2023,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, // Create a Stack Object to receive the return value of type f128. ArgListEntry Entry; int RetFI = MFI->CreateStackObject(16, 8, false); - RetPtr = DAG.getFrameIndex(RetFI, getPointerTy()); + RetPtr = DAG.getFrameIndex(RetFI, PtrVT); Entry.Node = RetPtr; Entry.Ty = PointerType::getUnqual(RetTy); if (!Subtarget->is64Bit()) @@ -2082,7 +2086,8 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break; } - SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT); Type *RetTy = Type::getInt32Ty(*DAG.getContext()); ArgListTy Args; SDValue Chain = DAG.getEntryNode(); @@ -2362,6 +2367,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, const SparcTargetLowering &TLI) { MachineFunction &MF = DAG.getMachineFunction(); SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // Need frame address to find the address of VarArgsFrameIndex. MF.getFrameInfo()->setFrameAddressIsTaken(true); @@ -2370,9 +2376,8 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, // memory location argument. SDLoc DL(Op); SDValue Offset = - DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), - DAG.getRegister(SP::I6, TLI.getPointerTy()), - DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL)); + DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT), + DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL)); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); @@ -2497,8 +2502,8 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG, SDValue RetAddr; if (depth == 0) { - unsigned RetReg = MF.addLiveIn(SP::I7, - TLI.getRegClassFor(TLI.getPointerTy())); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT)); RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT); return RetAddr; } @@ -3065,7 +3070,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
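/// (Illustrative note on the signature change below: constraint strings
/// move from const std::string & to StringRef throughout this patch.
/// StringRef is a non-owning pointer+length view, so a call like
/// getConstraintType("r") needs no heap allocation at the call site,
/// while size() and operator[] keep working unchanged.)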
SparcTargetLowering::ConstraintType -SparcTargetLowering::getConstraintType(const std::string &Constraint) const { +SparcTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; @@ -3139,7 +3144,7 @@ LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass *> SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index b6bc3d255713..bbc91a493c9d 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -72,7 +72,7 @@ namespace llvm { const char *getTargetNodeName(unsigned Opcode) const override; - ConstraintType getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; @@ -82,14 +82,16 @@ namespace llvm { SelectionDAG &DAG) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue LowerFormalArguments(SDValue Chain, diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td index 670e9e989c81..25cc652dbd9e 100644 --- a/lib/Target/Sparc/SparcInstrAliases.td +++ b/lib/Target/Sparc/SparcInstrAliases.td @@ -245,6 +245,7 @@ multiclass fp_cond_alias<string cond, int condVal> { } defm : int_cond_alias<"a", 0b1000>; +defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual defm : int_cond_alias<"n", 0b0000>; defm : int_cond_alias<"ne", 0b1001>; defm : int_cond_alias<"nz", 0b1001>; // same as ne @@ -266,6 +267,7 @@ defm : int_cond_alias<"vc", 0b1111>; defm : int_cond_alias<"vs", 0b0111>; defm : fp_cond_alias<"a", 0b0000>; +defm : fp_cond_alias<"", 0b0000>; // same as a; gnu asm, not in manual defm : fp_cond_alias<"n", 0b1000>; defm : fp_cond_alias<"u", 0b0111>; defm : fp_cond_alias<"g", 0b0110>; @@ -284,7 +286,16 @@ defm : fp_cond_alias<"le", 0b1101>; defm : fp_cond_alias<"ule", 0b1110>; defm : fp_cond_alias<"o", 0b1111>; -// Instruction aliases for JMPL. +// Section A.3 Synthetic Instructions + +// Most are marked as Emit=0, so that they are not used for disassembly. This is +// an aesthetic issue, but the chosen policy is to typically prefer using the +// non-alias form, except for the most obvious and clarifying aliases: cmp, jmp, +// call, tst, ret, retl. + +// Note: cmp is handled in SparcInstrInfo. 
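// (Illustrative expansions of the synthetic forms defined below; the
// register choices are ours:
//   tst %o1    ->  orcc %g0, %o1, %g0
//   inc %o2    ->  add  %o2, 1, %o2
//   clr [%o3]  ->  st   %g0, [%o3]
// )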
+// jmp/call/ret/retl have special case handling for output in +// SparcInstPrinter.cpp // jmp addr -> jmpl addr, %g0 def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>; @@ -294,25 +305,129 @@ def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>; def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>; def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>; -// retl -> RETL 8 -def : InstAlias<"retl", (RETL 8)>; +// tst reg -> orcc %g0, reg, %g0 +def : InstAlias<"tst $rs2", (ORCCrr G0, IntRegs:$rs2, G0)>; -// ret -> RET 8 +// ret -> jmpl %i7+8, %g0 (aka RET 8) def : InstAlias<"ret", (RET 8)>; -// mov reg, rd -> or %g0, reg, rd -def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>; +// retl -> jmpl %o7+8, %g0 (aka RETL 8) +def : InstAlias<"retl", (RETL 8)>; -// mov simm13, rd -> or %g0, simm13, rd -def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>; +// restore -> restore %g0, %g0, %g0 +def : InstAlias<"restore", (RESTORErr G0, G0, G0)>; + +// save -> restore %g0, %g0, %g0 +def : InstAlias<"save", (SAVErr G0, G0, G0)>; // set value, rd // (turns into a sequence of sethi+or, depending on the value) // def : InstAlias<"set $val, $rd", (ORri IntRegs:$rd, (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>; def SET : AsmPseudoInst<(outs IntRegs:$rd), (ins i32imm:$val), "set $val, $rd">; -// restore -> restore %g0, %g0, %g0 -def : InstAlias<"restore", (RESTORErr G0, G0, G0)>; +// not rd -> xnor rd, %g0, rd +def : InstAlias<"not $rd", (XNORrr IntRegs:$rd, IntRegs:$rd, G0), 0>; + +// not reg, rd -> xnor reg, %g0, rd +def : InstAlias<"not $rs1, $rd", (XNORrr IntRegs:$rd, IntRegs:$rs1, G0), 0>; + +// neg rd -> sub %g0, rd, rd +def : InstAlias<"neg $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rd), 0>; + +// neg reg, rd -> sub %g0, reg, rd +def : InstAlias<"neg $rs2, $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rs2), 0>; + +// inc rd -> add rd, 1, rd +def : InstAlias<"inc $rd", (ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// inc simm13, rd -> add rd, simm13, rd +def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// inccc rd -> addcc rd, 1, rd +def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// inccc simm13, rd -> addcc rd, simm13, rd +def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// dec rd -> sub rd, 1, rd +def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// dec simm13, rd -> sub rd, simm13, rd +def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// deccc rd -> subcc rd, 1, rd +def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// deccc simm13, rd -> subcc rd, simm13, rd +def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0 +def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>; +def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>; + +// bset reg_or_imm, rd -> or rd,reg_or_imm,rd +def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd +def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// btog reg_or_imm, rd 
-> xor rd,reg_or_imm,rd +def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + + +// clr rd -> or %g0, %g0, rd +def : InstAlias<"clr $rd", (ORrr IntRegs:$rd, G0, G0), 0>; + +// clr{b,h,} [addr] -> st{b,h,} %g0, [addr] +def : InstAlias<"clrb [$addr]", (STBrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clrb [$addr]", (STBri MEMri:$addr, G0), 0>; +def : InstAlias<"clrh [$addr]", (STHrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clrh [$addr]", (STHri MEMri:$addr, G0), 0>; +def : InstAlias<"clr [$addr]", (STrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>; + + +// mov reg_or_imm, rd -> or %g0, reg_or_imm, rd +def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>; +def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>; + +// mov specialreg, rd -> rd specialreg, rd +def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>; +def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>; +def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>; +def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>; + +// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg +def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; + +// End of Section A.3 + +// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg +// (aka: omit the first arg when it's g0. 
This is not in the manual, but is +// supported by gnu and solaris as) +def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; + + +// flush -> flush %g0 +def : InstAlias<"flush", (FLUSH), 0>; + def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>; diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index f87cee43e319..6167c532db80 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -324,6 +324,15 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, numSubRegs = 4; movOpc = SP::FMOVS; } + } else if (SP::ASRRegsRegClass.contains(DestReg) && + SP::IntRegsRegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(SP::WRASRrr), DestReg) + .addReg(SP::G0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else if (SP::IntRegsRegClass.contains(DestReg) && + SP::ASRRegsRegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(SP::RDASR), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } else llvm_unreachable("Impossible reg-to-reg copy"); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index a02bae07a336..3b9e048ea8b3 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -536,6 +536,7 @@ let Defs = [ICC] in let Uses = [ICC] in defm SUBC : F3_12np <"subx", 0b001100>; +// cmp (from Section A.3) is a specialized alias for subcc let Defs = [ICC], rd = 0 in { def CMPrr : F3_1<2, 0b010100, (outs), (ins IntRegs:$rs1, IntRegs:$rs2), @@ -559,12 +560,12 @@ let Defs = [Y, ICC] in { } // Section B.19 - Divide Instructions, p. 115 -let Defs = [Y] in { +let Uses = [Y], Defs = [Y] in { defm UDIV : F3_12np<"udiv", 0b001110>; defm SDIV : F3_12np<"sdiv", 0b001111>; } -let Defs = [Y, ICC] in { +let Uses = [Y], Defs = [Y, ICC] in { defm UDIVCC : F3_12np<"udivcc", 0b011110>; defm SDIVCC : F3_12np<"sdivcc", 0b011111>; } @@ -828,6 +829,20 @@ let rd = 0 in def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22), "unimp $imm22", []>; +// Section B.32 - Flush Instruction Memory +let rd = 0 in { + def FLUSHrr : F3_1<2, 0b111011, (outs), (ins MEMrr:$addr), + "flush $addr", []>; + def FLUSHri : F3_2<2, 0b111011, (outs), (ins MEMri:$addr), + "flush $addr", []>; + + // The no-arg FLUSH is only here for the benefit of the InstAlias + // "flush", which cannot seem to use FLUSHrr, due to the inability + // to construct a MEMrr with fixed G0 registers. + let rs1 = 0, rs2 = 0 in + def FLUSH : F3_1<2, 0b111011, (outs), (ins), "flush %g0", []>; +} + // Section B.33 - Floating-point Operate (FPop) Instructions // Convert Integer to Floating-point Instructions, p. 
141 diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index e504da4d3b21..db8a7e86962d 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -249,4 +249,6 @@ def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; // Ancillary state registers def ASRRegs : RegisterClass<"SP", [i32], 32, - (add Y, (sequence "ASR%u", 1, 31))>; + (add Y, (sequence "ASR%u", 1, 31))> { + let isAllocatable = 0; +} diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp deleted file mode 100644 index a308fc5e739e..000000000000 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===-- SparcSelectionDAGInfo.cpp - Sparc SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SparcSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "SparcSelectionDAGInfo.h" -using namespace llvm; - -#define DEBUG_TYPE "sparc-selectiondag-info" - -SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) { -} - -SparcSelectionDAGInfo::~SparcSelectionDAGInfo() { -} diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h deleted file mode 100644 index 6818291b30b4..000000000000 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- SparcSelectionDAGInfo.h - Sparc SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the Sparc subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class SparcTargetMachine; - -class SparcSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit SparcSelectionDAGInfo(const DataLayout &DL); - ~SparcSelectionDAGInfo() override; -}; - -} - -#endif diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 479b25d2723f..d69da409e428 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -54,7 +54,7 @@ SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU, bool is64Bit) : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering(*this) {} + FrameLowering(*this) {} int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index 983b1193975d..9d21911d88f0 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -17,9 +17,9 @@ #include "SparcFrameLowering.h" #include "SparcInstrInfo.h" #include "SparcISelLowering.h" -#include "SparcSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -39,7 +39,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { bool UsePopc; SparcInstrInfo InstrInfo; SparcTargetLowering TLInfo; - SparcSelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; SparcFrameLowering FrameLowering; public: @@ -56,7 +56,7 @@ public: const SparcTargetLowering *getTargetLowering() const override { return &TLInfo; } - const SparcSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 81882106fc46..5fefa315a4cf 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -148,7 +148,7 @@ static MCInstrInfo *createSystemZMCInstrInfo() { return X; } -static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitSystemZMCRegisterInfo(X, SystemZ::R14D); return X; @@ -156,12 +156,11 @@ static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitSystemZMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createSystemZMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSystemZMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index a636b35635ce..397de472a6ee 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ 
b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -61,11 +61,12 @@ SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { return SpillOffsetTable; } -void SystemZFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + MachineFrameInfo *MFFrame = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); bool HasFP = hasFP(MF); SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>(); @@ -77,17 +78,17 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // argument register R6D. if (IsVarArg) for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I) - MRI.setPhysRegUsed(SystemZ::ArgGPRs[I]); + SavedRegs.set(SystemZ::ArgGPRs[I]); // If the function requires a frame pointer, record that the hard // frame pointer will be clobbered. if (HasFP) - MRI.setPhysRegUsed(SystemZ::R11D); + SavedRegs.set(SystemZ::R11D); // If the function calls other functions, record that the return // address register will be clobbered. if (MFFrame->hasCalls()) - MRI.setPhysRegUsed(SystemZ::R14D); + SavedRegs.set(SystemZ::R14D); // If we are saving GPRs other than the stack pointer, we might as well // save and restore the stack pointer at the same time, via STMG and LMG. @@ -96,8 +97,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); for (unsigned I = 0; CSRegs[I]; ++I) { unsigned Reg = CSRegs[I]; - if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) { - MRI.setPhysRegUsed(SystemZ::R15D); + if (SystemZ::GR64BitRegClass.contains(Reg) && SavedRegs.test(Reg)) { + SavedRegs.set(SystemZ::R15D); break; } } diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 60bad894ee44..5ade757f17f7 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -27,8 +27,8 @@ public: bool isFPCloseToIncomingSP() const override { return false; } const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 372f6fb3ea50..056ee02dcc21 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -81,10 +81,11 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { return Op; } -SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, +SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) - : TargetLowering(tm), Subtarget(STI) { - MVT PtrVT = getPointerTy(); + : TargetLowering(TM), Subtarget(STI) { + auto &DL = *TM.getDataLayout(); + MVT PtrVT = getPointerTy(DL); // Set up the register classes. 
if (Subtarget.hasHighWord()) @@ -455,7 +456,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, MaxStoresPerMemsetOptSize = 0; } -EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, + LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -507,8 +509,8 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return true; } -bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Punt on globals for now, although they can be used in limited // RELATIVE LONG cases. @@ -544,7 +546,7 @@ bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const { //===----------------------------------------------------------------------===// TargetLowering::ConstraintType -SystemZTargetLowering::getConstraintType(const std::string &Constraint) const { +SystemZTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'a': // Address register @@ -641,13 +643,14 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info, // has already been verified. MC is the class associated with "t" and // Map maps 0-based register numbers to LLVM register numbers. static std::pair<unsigned, const TargetRegisterClass *> -parseRegisterNumber(const std::string &Constraint, - const TargetRegisterClass *RC, const unsigned *Map) { +parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC, + const unsigned *Map) { assert(*(Constraint.end()-1) == '}' && "Missing '}'"); if (isdigit(Constraint[2])) { - std::string Suffix(Constraint.data() + 2, Constraint.size() - 2); - unsigned Index = atoi(Suffix.c_str()); - if (Index < 16 && Map[Index]) + unsigned Index; + bool Failed = + Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index); + if (!Failed && Index < 16 && Map[Index]) return std::make_pair(Map[Index], RC); } return std::make_pair(0U, nullptr); @@ -655,8 +658,7 @@ parseRegisterNumber(const std::string &Constraint, std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { @@ -687,7 +689,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &SystemZ::FP32BitRegClass); } } - if (Constraint[0] == '{') { + if (Constraint.size() > 0 && Constraint[0] == '{') { // We need to override the default register parsing for GPRs and FPRs // because the interpretation depends on VT. The internal names of // the registers are also different from the external names @@ -931,7 +933,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, // Create the SelectionDAG nodes corresponding to a load // from this parameter. Unpromoted ints and floats are // passed as right-justified 8-byte values. 
- EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32) FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, @@ -969,7 +971,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) { unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]); int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I], &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); @@ -1019,7 +1021,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(MF.getDataLayout()); // Detect unsupported vector argument and return types. if (Subtarget.hasVector()) { @@ -2401,7 +2403,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); int64_t Offset = Node->getOffset(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RM = DAG.getTarget().getRelocationModel(); CodeModel::Model CM = DAG.getTarget().getCodeModel(); @@ -2440,7 +2442,7 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, unsigned Opcode, SDValue GOTOffset) const { SDLoc DL(Node); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDValue Glue; @@ -2486,7 +2488,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SelectionDAG &DAG) const { SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); // The high part of the thread pointer is in access register 0. @@ -2587,7 +2589,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node, SDLoc DL(Node); const BlockAddress *BA = Node->getBlockAddress(); int64_t Offset = Node->getOffset(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset); Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -2597,7 +2599,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node, SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const { SDLoc DL(JT); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); // Use LARL to load the address of the table. 
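The pattern repeated through these SystemZ (and the earlier Sparc) lowering hunks is mechanical: getPointerTy() loses its zero-argument form and takes the DataLayout, which the caller obtains from the SelectionDAG or MachineFunction. A minimal before/after sketch under that assumption (function and operands hypothetical, not from the patch):

    // Before: EVT PtrVT = getPointerTy();   // DataLayout was implicit.
    // After: the DataLayout is threaded explicitly by the caller.
    static SDValue lowerExample(SDValue Op, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
      SDLoc DL(Op);
      EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      // e.g. materialize Op + 8 in the target's pointer width:
      return DAG.getNode(ISD::ADD, DL, PtrVT, Op,
                         DAG.getConstant(8, DL, PtrVT));
    }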
@@ -2607,7 +2609,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT, SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const { SDLoc DL(CP); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (CP->isMachineConstantPoolEntry()) @@ -2671,7 +2673,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo<SystemZMachineFunctionInfo>(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 2f7617bbdac3..949b67f114ea 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -339,10 +339,10 @@ public: const SystemZSubtarget &STI); // Override TargetLowering. - MVT getScalarShiftAmountTy(EVT LHSTy) const override { + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } - MVT getVectorIdxTy() const override { + MVT getVectorIdxTy(const DataLayout &DL) const override { // Only the lower 12 bits of an element index are used, so we don't // want to clobber the upper 32 bits of a GPR unnecessarily. return MVT::i32; @@ -364,12 +364,13 @@ public: return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } - EVT getSetCCResultType(LLVMContext &, EVT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool isLegalICmpImmediate(int64_t Imm) const override; bool isLegalAddImmediate(int64_t Imm) const override; - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, @@ -379,10 +380,9 @@ public: const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; TargetLowering::ConstraintType - getConstraintType(const std::string &Constraint) const override; + getConstraintType(StringRef Constraint) const override; TargetLowering::ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; @@ -391,8 +391,7 @@ public: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode.size() == 1) { switch(ConstraintCode[0]) { default: diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 7cabea962e91..dc7bd25d7ed5 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -36,7 +36,7 @@ SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF, BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = 
MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) { // R11D is the frame pointer. Reserve all aliases. @@ -64,7 +64,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineFunction &MF = *MBB.getParent(); auto *TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); DebugLoc DL = MI->getDebugLoc(); // Decompose the frame index into a base and offset. @@ -135,6 +135,6 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D; } diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index e7e0268dbb8a..178aa3817311 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -18,12 +18,6 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { -} - // Decide whether it is best to use a loop or straight-line code for // a block operation of Size bytes with source address Src and destination // address Dest. Sequence is the opcode to use for straight-line code diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index a257d6b55494..246fa3e5e656 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -22,8 +22,7 @@ class SystemZTargetMachine; class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SystemZSelectionDAGInfo(const DataLayout &DL); - ~SystemZSelectionDAGInfo(); + explicit SystemZSelectionDAGInfo() = default; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Dst, SDValue Src, diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb5e5c0b9ff8..0b49fcdd8f78 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -42,7 +42,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasTransactionalExecution(false), HasProcessorAssist(false), HasVector(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering() {} + TSInfo(), FrameLowering() {} // Return true if GV binds locally under reloc model RM. 
static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) { diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index e9cabe968eea..4b80973ed879 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -29,7 +29,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. SystemZTTIImpl(const SystemZTTIImpl &Arg) @@ -37,18 +38,6 @@ public: SystemZTTIImpl(SystemZTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - SystemZTTIImpl &operator=(const SystemZTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - SystemZTTIImpl &operator=(SystemZTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 0b05303f71bf..83174c20c8e9 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -150,8 +150,9 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &) { return TargetTransformInfo(getDataLayout()); }); + return TargetIRAnalysis([this](Function &F) { + return TargetTransformInfo(F.getParent()->getDataLayout()); + }); } static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 87df7af84525..6a61fcdf0f86 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -19,7 +19,14 @@ using namespace llvm; //--------------------------------------------------------------------------- // TargetSubtargetInfo Class // -TargetSubtargetInfo::TargetSubtargetInfo() {} +TargetSubtargetInfo::TargetSubtargetInfo( + const Triple &TT, StringRef CPU, StringRef FS, + ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD, + const SubtargetInfoKV *ProcSched, const MCWriteProcResEntry *WPR, + const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA, + const InstrStage *IS, const unsigned *OC, const unsigned *FP) + : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) { +} TargetSubtargetInfo::~TargetSubtargetInfo() {} diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index df04c2a3460b..25de9eee0831 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_TARGET_DEFINITIONS WebAssembly.td) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index d248556c62d7..224aa773a80e 100644 --- 
a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -29,6 +29,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_MC_DESC #include "WebAssemblyGenSubtargetInfo.inc" +#define GET_REGINFO_MC_DESC +#include "WebAssemblyGenRegisterInfo.inc" + static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 24893daec7ea..eebf5b72f62b 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -47,6 +47,9 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, // Defines symbolic names for WebAssembly registers. This defines a mapping from // register name to register number. // +#define GET_REGINFO_ENUM +#include "WebAssemblyGenRegisterInfo.inc" + #define GET_SUBTARGETINFO_ENUM #include "WebAssemblyGenSubtargetInfo.inc" diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile index 35d835c6506c..f102d73f6e86 100644 --- a/lib/Target/WebAssembly/Makefile +++ b/lib/Target/WebAssembly/Makefile @@ -12,7 +12,8 @@ LIBRARYNAME = LLVMWebAssemblyCodeGen TARGET = WebAssembly # Make sure that tblgen is run, first thing. -BUILT_SOURCES = WebAssemblyGenSubtargetInfo.inc WebAssemblyGenMCCodeEmitter.inc +BUILT_SOURCES = WebAssemblyGenRegisterInfo.inc WebAssemblyGenSubtargetInfo.inc \ + WebAssemblyGenMCCodeEmitter.inc DIRS = InstPrinter TargetInfo MCTargetDesc diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index 7a71060a638f..63e02c455895 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -12,4 +12,15 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +Interesting work that remains to be done: +* Write a pass to restructurize irreducible control flow. This needs to be done + before register allocation to be efficient, because it may duplicate basic + blocks and WebAssembly performs register allocation at a whole-function + level. Note that LLVM's GPU code has such a pass, but it linearizes control + flow (e.g. both sides of branches execute and are masked) which is undesirable + for WebAssembly. +* Basic relooper to expose control flow as an AST. +* Figure out how to properly use MC for virtual ISAs. This may require some + refactoring of MC. + //===---------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4eec02efbd94..4184eb6dc5a6 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -38,6 +38,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // WebAssembly does not produce floating-point exceptions on normal floating // point operations. setHasFloatingPointExceptions(false); + // We don't know the microarchitecture here, so just reduce register pressure. 
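// (Editor's note, illustrative: Sched::RegPressure is one of the generic
// Sched::Preference values, alongside e.g. Sched::Source and Sched::ILP;
// it biases pre-RA scheduling toward shorter live ranges instead of
// latency hiding, a reasonable default for a virtual ISA with no
// published pipeline model.)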
+ setSchedulingPreference(Sched::RegPressure); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 35e88eec8573..64415658ed81 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly Atomic operand code-gen constructs. -// +/// +/// \file +/// \brief WebAssembly Atomic operand code-gen constructs. +/// //===----------------------------------------------------------------------===// // TODO: Implement atomic instructions. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td new file mode 100644 index 000000000000..6b5b6cd54173 --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -0,0 +1,21 @@ +//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Call operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. + * + * call_direct: call function directly + * call_indirect: call function indirectly + * addressof: obtain a function pointer value for a given function + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td new file mode 100644 index 000000000000..3fa29061b1de --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -0,0 +1,44 @@ +//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-= +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly datatype conversions, truncations, reinterpretations, +/// promotions, and demotions operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. 
+ * + * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer + * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer + * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer + * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer + * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer + * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer + * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer + * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer + * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer + * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer + * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer + * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer + * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer + * float32.demote[float64]: demote a 64-bit float to a 32-bit float + * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float + * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float + * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float + * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float + * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float + * float64.promote[float32]: promote a 32-bit float to a 64-bit float + * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float + * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float + * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float + * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float + * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td new file mode 100644 index 000000000000..30ef6339d65a --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -0,0 +1,44 @@ +// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Floating-point operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +defm FADD : BinaryFP<fadd>; +defm FSUB : BinaryFP<fsub>; +defm FMUL : BinaryFP<fmul>; +defm FDIV : BinaryFP<fdiv>; +defm FABS : UnaryFP<fabs>; +defm FNEG : UnaryFP<fneg>; +defm COPYSIGN : BinaryFP<fcopysign>; +defm CEIL : UnaryFP<fceil>; +defm FLOOR : UnaryFP<ffloor>; +defm TRUNC : UnaryFP<ftrunc>; +defm NEARESTINT : UnaryFP<fnearbyint>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. + * + * float32.eq: compare equal + * float32.lt: less than + * float32.le: less than or equal + * float32.gt: greater than + * float32.ge: greater than or equal + */ + +defm SQRT : UnaryFP<fsqrt>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. 
+ * + * float32.min: minimum (binary operator); if either operand is NaN, returns NaN + * float32.max: maximum (binary operator); if either operand is NaN, returns NaN + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 8bbf3e9ec87b..513c36fa2ec2 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly instruction format definitions. -// +/// +/// \file +/// \brief WebAssembly instruction format definitions. +/// //===----------------------------------------------------------------------===// // WebAssembly Instruction Format @@ -26,3 +27,29 @@ class I<dag oops, dag iops, list<dag> pattern, string cstr = ""> dag InOperandList = iops; let Pattern = pattern; } + +// Unary and binary instructions, for the local types that WebAssembly supports. +multiclass UnaryInt<SDNode node> { + def _I32 : I<(outs Int32:$dst), (ins Int32:$src), + [(set Int32:$dst, (node Int32:$src))]>; + def _I64 : I<(outs Int64:$dst), (ins Int64:$src), + [(set Int64:$dst, (node Int64:$src))]>; +} +multiclass BinaryInt<SDNode node> { + def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs), + [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>; + def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs), + [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>; +} +multiclass UnaryFP<SDNode node> { + def _F32 : I<(outs Float32:$dst), (ins Float32:$src), + [(set Float32:$dst, (node Float32:$src))]>; + def _F64 : I<(outs Float64:$dst), (ins Float64:$src), + [(set Float64:$dst, (node Float64:$src))]>; +} +multiclass BinaryFP<SDNode node> { + def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs), + [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>; + def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs), + [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>; +} diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 142eccfbcaa5..fe3ca76dc08a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly Instruction definitions. -// +/// +/// \file +/// \brief WebAssembly Instruction definitions. +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -32,6 +33,13 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// +/* + * TODO(jfb): Add the following. + * + * get_local: read the current value of a local variable + * set_local: set the current value of a local variable +*/ + //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. //===----------------------------------------------------------------------===// @@ -42,5 +50,10 @@ include "WebAssemblyInstrFormats.td" // Additional sets of instructions. 
//===----------------------------------------------------------------------===// +include "WebAssemblyInstrMemory.td" +include "WebAssemblyInstrCall.td" +include "WebAssemblyInstrInteger.td" +include "WebAssemblyInstrFloat.td" +include "WebAssemblyInstrConv.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td new file mode 100644 index 000000000000..5f60fe81b1a2 --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -0,0 +1,45 @@ +// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Integer operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +defm ADD : BinaryInt<add>; +defm SUB : BinaryInt<sub>; +defm MUL : BinaryInt<mul>; +defm SDIV : BinaryInt<sdiv>; +defm UDIV : BinaryInt<udiv>; +defm SREM : BinaryInt<srem>; +defm UREM : BinaryInt<urem>; +defm AND : BinaryInt<and>; +defm IOR : BinaryInt<or>; +defm XOR : BinaryInt<xor>; +defm SHL : BinaryInt<shl>; +defm SHR : BinaryInt<srl>; +defm SAR : BinaryInt<sra>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. + * + * int32.eq: signed-less compare equal + * int32.slt: signed less than + * int32.sle: signed less than or equal + * int32.ult: unsigned less than + * int32.ule: unsigned less than or equal + * int32.sgt: signed greater than + * int32.sge: signed greater than or equal + * int32.ugt: unsigned greater than + * int32.uge: unsigned greater than or equal + */ + +defm CLZ : UnaryInt<ctlz>; +defm CTZ : UnaryInt<cttz>; +defm POPCNT : UnaryInt<ctpop>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td new file mode 100644 index 000000000000..5ab40e826caa --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -0,0 +1,46 @@ +// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Memory operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. + * Each has optional alignment and immediate byte offset. 
+ * + * int32.load_sx[int8]: sign-extend to int32 + * int32.load_sx[int16]: sign-extend to int32 + * int32.load_zx[int8]: zero-extend to int32 + * int32.load_zx[int16]: zero-extend to int32 + * int32.load[int32]: (no conversion) + * int64.load_sx[int8]: sign-extend to int64 + * int64.load_sx[int16]: sign-extend to int64 + * int64.load_sx[int32]: sign-extend to int64 + * int64.load_zx[int8]: zero-extend to int64 + * int64.load_zx[int16]: zero-extend to int64 + * int64.load_zx[int32]: zero-extend to int64 + * int64.load[int64]: (no conversion) + * float32.load[float32]: (no conversion) + * float64.load[float64]: (no conversion) + * + * int32.store[int8]: wrap int32 to int8 + * int32.store[int16]: wrap int32 to int16 + * int32.store[int32]: (no conversion) + * int64.store[int8]: wrap int64 to int8 + * int64.store[int16]: wrap int64 to int16 + * int64.store[int32]: wrap int64 to int32 + * int64.store[int64]: (no conversion) + * float32.store[float32]: (no conversion) + * float64.store[float64]: (no conversion) + * + * load_global: load the value of a given global variable + * store_global: store a given value to a given global variable + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index e25483ad3f7a..3e29906219d2 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly SIMD operand code-gen constructs. -// +/// +/// \file +/// \brief WebAssembly SIMD operand code-gen constructs. +/// //===----------------------------------------------------------------------===// // TODO: Implement SIMD instructions. 
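The load flavors listed in the WebAssemblyInstrMemory.td TODO above map one-to-one onto SelectionDAG's load extension kinds, which is presumably how they would eventually be pattern-matched. A minimal C++ sketch of that correspondence; the helper name is illustrative, not part of this patch:

#include "llvm/CodeGen/ISDOpcodes.h"

// Classify a WebAssembly-style load by how it widens its result:
// int32.load[int32] and friends are plain loads, while the _sx/_zx
// variants are sign- and zero-extending loads respectively.
static llvm::ISD::LoadExtType classifyLoad(bool Extending, bool SignExtending) {
  if (!Extending)
    return llvm::ISD::NON_EXTLOAD;            // e.g. int32.load[int32]
  return SignExtending ? llvm::ISD::SEXTLOAD  // e.g. int32.load_sx[int8]
                       : llvm::ISD::ZEXTLOAD; // e.g. int32.load_zx[int8]
}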
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index ad24c90af6a2..385c40bf6693 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -30,4 +30,58 @@ using namespace llvm; #define DEBUG_TYPE "wasm-reg-info" -WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT) : TT(TT) {} +#define GET_REGINFO_TARGET_DESC +#include "WebAssemblyGenRegisterInfo.inc" + +WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT) + : WebAssemblyGenRegisterInfo(0), TT(TT) {} + +const MCPhysReg * +WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { + static const MCPhysReg CalleeSavedRegs[] = {0}; + return CalleeSavedRegs; +} + +BitVector +WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32, + WebAssembly::FP64}) + Reserved.set(Reg); + return Reserved; +} + +void WebAssemblyRegisterInfo::eliminateFrameIndex( + MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME +} + +unsigned +WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + static const unsigned Regs[2][2] = { + /* !isArch64Bit isArch64Bit */ + /* !hasFP */ {WebAssembly::SP32, WebAssembly::SP64}, + /* hasFP */ {WebAssembly::FP32, WebAssembly::FP64}}; + const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); + return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; +} + +bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const { + return !MF.getFunction()->hasFnAttribute("no-realign-stack"); +} + +// FIXME: share this with other backends with identical implementation? +bool WebAssemblyRegisterInfo::needsStackRealignment( + const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); + const Function *F = MF.getFunction(); + unsigned StackAlign = TFI->getStackAlignment(); + bool requiresRealignment = + ((MFI->getMaxAlignment() > StackAlign) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackAlignment)); + + return requiresRealignment && canRealignStack(MF); +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index 55300287a51e..dbdb9d0457af 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -16,6 +16,9 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H +#define GET_REGINFO_HEADER +#include "WebAssemblyGenRegisterInfo.inc" + namespace llvm { class MachineFunction; @@ -23,11 +26,25 @@ class RegScavenger; class TargetRegisterClass; class Triple; -class WebAssemblyRegisterInfo final { +class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo { const Triple &TT; public: explicit WebAssemblyRegisterInfo(const Triple &TT); + + // Code Generation virtual methods. 
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + // Base pointer (stack realignment) support. + bool canRealignStack(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 7b3d636a2605..2ba42eb94a40 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the WebAssembly register classes and some nominal -// physical registers. -// +/// +/// \file +/// \brief This file describes the WebAssembly register classes and some nominal +/// physical registers. +/// //===----------------------------------------------------------------------===// class WebAssemblyReg<string n> : Register<n> { @@ -23,6 +24,31 @@ class WebAssemblyRegClass<list<ValueType> regTypes, int alignment, dag regList> // Registers //===----------------------------------------------------------------------===// +// Special registers used as the frame and stack pointer. +// +// WebAssembly may someday support mixed 32-bit and 64-bit heaps in the same +// application, which requires separate-width FP and SP. +def FP32 : WebAssemblyReg<"%FP32">; +def FP64 : WebAssemblyReg<"%FP64">; +def SP32 : WebAssemblyReg<"%SP32">; +def SP64 : WebAssemblyReg<"%SP64">; + +// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do +// away with it? Try deleting once the backend works. +// WebAssembly uses virtual registers, but the backend defines a few physical +// registers here to keep SDAG and the MachineInstr layers happy.
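+// (The foreach below expands to twenty defs: I0-I4 for i32, L0-L4 for i64,
+// F0-F4 for f32, and D0-D4 for f64.)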
+foreach i = 0-4 in { + def I#i : WebAssemblyReg<"%i."#i>; // i32 + def L#i : WebAssemblyReg<"%l."#i>; // i64 + def F#i : WebAssemblyReg<"%f."#i>; // f32 + def D#i : WebAssemblyReg<"%d."#i>; // f64 +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// + +def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>; +def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>; +def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; +def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>; diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index cfd1bafff236..fae9c6100510 100644 --- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -17,7 +17,4 @@ using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" -WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {} diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 03e8d393558d..13d96671276d 100644 --- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -22,7 +22,6 @@ namespace llvm { class WebAssemblySelectionDAGInfo final : public TargetSelectionDAGInfo { public: - explicit WebAssemblySelectionDAGInfo(const DataLayout *DL); ~WebAssemblySelectionDAGInfo() override; }; diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index addea8e3cc36..3d9e7aacbfbf 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -42,7 +42,7 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, const TargetMachine &TM) : WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false), CPUString(CPU), TargetTriple(TT), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(FS)), - TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} + InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), + TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 08bd88c06985..7ffb6047b963 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -31,7 +31,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { typedef TargetTransformInfo TTI; friend BaseT; - const WebAssemblyTargetMachine *TM; const WebAssemblySubtarget *ST; const WebAssemblyTargetLowering *TLI; @@ -40,30 +39,15 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { public: WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F) - : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)), + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
WebAssemblyTTIImpl(const WebAssemblyTTIImpl &Arg) - : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST), - TLI(Arg.TLI) {} + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} WebAssemblyTTIImpl(WebAssemblyTTIImpl &&Arg) - : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)), - ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - WebAssemblyTTIImpl &operator=(const WebAssemblyTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - TM = RHS.TM; - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - WebAssemblyTTIImpl &operator=(WebAssemblyTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - TM = std::move(RHS.TM); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 3cad9fa1e2ae..91b144a44824 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -878,6 +878,29 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + case X86::PMOVZXBWrr: case X86::PMOVZXBDrr: case X86::PMOVZXBQrr: diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3e0dc1424609..629802f5dc5e 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -220,7 +220,6 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::PUSH32i8: return X86::PUSHi32; case X86::PUSH16i8: return X86::PUSHi16; case X86::PUSH64i8: return X86::PUSH64i32; - case X86::PUSH64i16: return X86::PUSH64i32; } } diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 89f394582631..ddb764facdbf 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -34,14 +34,16 @@ public: report_fatal_error(EC.message()); StringRef SymName = *SymNameOrErr; - uint64_t SymAddr; SymI->getAddress(SymAddr); + ErrorOr<uint64_t> SymAddr = SymI->getAddress(); + if (std::error_code EC = SymAddr.getError()) + report_fatal_error(EC.message()); uint64_t SymSize = SymI->getSize(); int64_t Addend = *ELFRelocationRef(Rel).getAddend(); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. if (!Sym->isVariable()) - Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx)); const MCExpr *Expr = nullptr; // If hasAddend is true, then we need to add Addend (r_addend) to Expr. 
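The X86ELFRelocationInfo.cpp hunk above switches SymbolRef::getAddress() from an out-parameter to ErrorOr. A minimal sketch of the consumption idiom it relies on, using a stand-in accessor rather than the real SymbolRef API:

#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include <cstdint>
#include <system_error>

// Stand-in for an ErrorOr-returning accessor such as SymbolRef::getAddress().
static llvm::ErrorOr<uint64_t> getAddressOrError() { return UINT64_C(0x1000); }

static uint64_t getAddressOrDie() {
  llvm::ErrorOr<uint64_t> AddrOrErr = getAddressOrError();
  if (std::error_code EC = AddrOrErr.getError())
    llvm::report_fatal_error(EC.message()); // no usable value on error
  return *AddrOrErr; // operator* yields the wrapped value
}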
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 431010d4cbc2..83b4091d7665 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -88,9 +88,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (CPUName.empty()) CPUName = "generic"; - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS); - return X; + return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS); } static MCInstrInfo *createX86MCInstrInfo() { @@ -99,17 +97,14 @@ static MCInstrInfo *createX86MCInstrInfo() { return X; } -static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { - Triple TheTriple(TT); - unsigned RA = (TheTriple.getArch() == Triple::x86_64) - ? X86::RIP // Should have dwarf #16. - : X86::EIP; // Should have dwarf #8. +static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { + unsigned RA = (TT.getArch() == Triple::x86_64) + ? X86::RIP // Should have dwarf #16. + : X86::EIP; // Should have dwarf #8. MCRegisterInfo *X = new MCRegisterInfo(); - InitX86MCRegisterInfo(X, RA, - X86_MC::getDwarfRegFlavour(TheTriple, false), - X86_MC::getDwarfRegFlavour(TheTriple, true), - RA); + InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; } @@ -156,24 +151,23 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - Triple T(TT); - bool is64Bit = T.getArch() == Triple::x86_64; + bool is64Bit = TT.getArch() == Triple::x86_64; if (RM == Reloc::Default) { // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we // use static relocation model by default. - if (T.isOSDarwin()) { + if (TT.isOSDarwin()) { if (is64Bit) RM = Reloc::PIC_; else RM = Reloc::DynamicNoPIC; - } else if (T.isOSWindows() && is64Bit) + } else if (TT.isOSWindows() && is64Bit) RM = Reloc::PIC_; else RM = Reloc::Static; @@ -186,13 +180,13 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, if (RM == Reloc::DynamicNoPIC) { if (is64Bit) RM = Reloc::PIC_; - else if (!T.isOSDarwin()) + else if (!TT.isOSDarwin()) RM = Reloc::Static; } // If we are on Darwin, disallow static relocation model in X86-64 mode, since // the Mach-O file format doesn't support it. - if (RM == Reloc::Static && T.isOSDarwin() && is64Bit) + if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit) RM = Reloc::PIC_; // For static codegen, if we're not already set, use Small codegen. 
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index c9479b62f7b6..9bfe999424fa 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -34,7 +34,7 @@ public: if (std::error_code EC = SymNameOrErr.getError()) report_fatal_error(EC.message()); StringRef SymName = *SymNameOrErr; - uint64_t SymAddr; SymI->getAddress(SymAddr); + uint64_t SymAddr = SymI->getValue(); any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); bool isPCRel = Obj->getAnyRelocationPCRel(RE); @@ -90,8 +90,7 @@ public: const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx); symbol_iterator RSymI = Rel.getSymbol(); - uint64_t RSymAddr; - RSymI->getAddress(RSymAddr); + uint64_t RSymAddr = RSymI->getValue(); ErrorOr<StringRef> RSymName = RSymI->getName(); if (std::error_code EC = RSymName.getError()) report_fatal_error(EC.message()); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index ef3318ba7580..cae865a40819 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -255,15 +255,13 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - if (Imm & 0x88) - return; // Not a shuffle unsigned HalfSize = VT.getVectorNumElements() / 2; for (unsigned l = 0; l != 2; ++l) { - unsigned HalfBegin = ((Imm >> (l * 4)) & 0x3) * HalfSize; + unsigned HalfMask = Imm >> (l * 4); + unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) - ShuffleMask.push_back(i); + ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i); } } @@ -431,4 +429,78 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64 bits. The upper 64 bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64.
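+  // (Worked example with hypothetical immediates: Len = 16 bits and Idx = 8
+  // bits decode to the v16i8 mask [0, 16, 17, 3, 4, 5, 6, 7, u, u, u, u, u,
+  // u, u, u]; two bytes of the second source land at byte offset 1 of the
+  // first.)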
+ if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64 bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 14b69434806e..3d10d18e860e 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,14 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A INSERTQ instruction as a v16i8 shuffle mask. +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 02645460b6a2..b4319c8bb04f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -317,7 +317,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { - EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); + EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; @@ -608,7 +608,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Prepare for inserting code in the local-value area. SavePoint SaveInsertPt = enterLocalValueArea(); - if (TLI.getPointerTy() == MVT::i64) { + if (TLI.getPointerTy(DL) == MVT::i64) { Opc = X86::MOV64rm; RC = &X86::GR64RegClass; @@ -690,13 +690,14 @@ redo_gep: case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; @@ -866,14 +867,14 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && - TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB.
- if (InMBB && - TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; } @@ -1000,7 +1001,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -1031,7 +1032,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - EVT SrcVT = TLI.getValueType(RV->getType()); + EVT SrcVT = TLI.getValueType(DL, RV->getType()); EVT DstVT = VA.getValVT(); // Special handling for extended integers. if (SrcVT != DstVT) { @@ -1300,7 +1301,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } bool X86FastISel::X86SelectZExt(const Instruction *I) { - EVT DstVT = TLI.getValueType(I->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; @@ -1309,7 +1310,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { return false; // Handle zero-extension from i1 to i8, which is common. - MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); + MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT.SimpleTy == MVT::i1) { // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); @@ -1362,7 +1363,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { X86::CondCode CC; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { - EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType()); // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); @@ -1802,7 +1803,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; @@ -2004,7 +2005,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { @@ -2166,8 +2167,8 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { } bool X86FastISel::X86SelectTrunc(const Instruction *I) { - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(I->getType()); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) @@ -2416,7 +2417,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. - EVT PtrTy = TLI.getPointerTy(); + EVT PtrTy = TLI.getPointerTy(DL); const Value *Op1 = II->getArgOperand(0); // The guard's value. 
const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); @@ -2735,7 +2736,7 @@ bool X86FastISel::fastLowerArguments() { if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; - EVT ArgVT = TLI.getValueType(ArgTy); + EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { default: return false; @@ -2772,7 +2773,7 @@ bool X86FastISel::fastLowerArguments() { unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { - MVT VT = TLI.getSimpleValueType(Arg.getType()); + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned SrcReg; switch (VT.SimpleTy) { @@ -3108,7 +3109,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - (GV->isDeclaration() || GV->isWeakForLinker()) && + !GV->isStrongDefinitionForLinker() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, @@ -3240,8 +3241,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectSIToFP(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(I->getType()); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); if (DstVT.bitsGT(SrcVT)) return X86SelectZExt(I); if (DstVT.bitsLT(SrcVT)) @@ -3384,7 +3385,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - TM.getDataLayout()->getPointerSize(), Align); + DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } @@ -3411,17 +3412,17 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (TM.getRelocationModel() == Reloc::Static && - TLI.getPointerTy() == MVT::i64) { + TLI.getPointerTy(DL) == MVT::i64) { // The displacement code could be more than 32 bits away so we need to use // an instruction with a 64 bit immediate BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), ResultReg) .addGlobalAddress(GV); } else { - unsigned Opc = TLI.getPointerTy() == MVT::i32 - ? (Subtarget->isTarget64BitILP32() - ? X86::LEA64_32r : X86::LEA32r) - : X86::LEA64r; + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } @@ -3431,7 +3432,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { } unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -3463,11 +3464,11 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = TLI.getPointerTy() == MVT::i32 - ? (Subtarget->isTarget64BitILP32() - ? 
X86::LEA64_32r : X86::LEA32r) - : X86::LEA64r; - const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a863a3..36a8cdbab55b 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -301,8 +301,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 85c5b6499131..2a35c4cf31f3 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -90,7 +90,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || + MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint()); @@ -967,13 +967,26 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { - // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + // Stash value of base pointer. Saving RSP instead of EBP shortens + // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } + + if (X86FI->getHasSEHFramePtrSave()) { + // Stash the value of the frame pointer relative to the base pointer for + // Win32 EH. This supports Win32 EH, which does the inverse of the above: + // it recovers the frame pointer from the base pointer rather than the + // other way around. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, + getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + .addReg(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { @@ -1412,9 +1425,11 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } -void -X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1436,7 +1451,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Spill the BasePtr if it's used. if (TRI->hasBasePointer(MF)) - MF.getRegInfo().setPhysRegUsed(TRI->getBaseRegister()); + SavedRegs.set(TRI->getBaseRegister()); } static bool @@ -1667,8 +1682,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index c274c8820149..495cfcd1c3f7 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -68,8 +68,8 @@ public: void adjustForHiPEPrologue(MachineFunction &MF, MachineBasicBlock &PrologueMBB) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; bool assignCalleeSavedSpillSlots(MachineFunction &MF, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 6b23e62a2d35..d5351d25d6ed 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -246,8 +246,9 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) - ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, - TLI->getPointerTy()) + ? 
CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, + TLI->getPointerTy(CurDAG->getDataLayout())) : AM.Base_Reg; Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; @@ -581,11 +582,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { void X86DAGToDAGISel::EmitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; + auto &DL = CurDAG->getDataLayout(); TargetLowering::CallLoweringInfo CLI(*CurDAG); CLI.setChain(CurDAG->getRoot()) .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), - CurDAG->getExternalSymbol("__main", TLI->getPointerTy()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), std::move(Args), 0); const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); @@ -1025,7 +1027,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, switch (N.getOpcode()) { default: break; - case ISD::FRAME_ALLOC_RECOVER: { + case ISD::LOCAL_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { // Use the symbol and don't prefix it. @@ -1638,7 +1640,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); + auto &DL = MF->getDataLayout(); + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } /// Atomic opcode table diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b16bd18aefaa..6e22ab30057c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -76,7 +76,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = getDataLayout(); + TD = TM.getDataLayout(); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -505,7 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -825,6 +825,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::SMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); @@ -944,6 +949,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1018,6 +1032,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } @@ -1141,6 +1156,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v32i8, Legal); + setOperationAction(ISD::SMAX, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v8i32, Legal); + setOperationAction(ISD::UMAX, MVT::v32i8, Legal); + setOperationAction(ISD::UMAX, MVT::v16i16, Legal); + setOperationAction(ISD::UMAX, MVT::v8i32, Legal); + setOperationAction(ISD::SMIN, MVT::v32i8, Legal); + setOperationAction(ISD::SMIN, MVT::v16i16, Legal); + setOperationAction(ISD::SMIN, MVT::v8i32, Legal); + setOperationAction(ISD::UMIN, MVT::v32i8, Legal); + setOperationAction(ISD::UMIN, MVT::v16i16, Legal); + setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); @@ -1184,6 +1212,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. 
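+  // (Marking ISD::SMAX/SMIN/UMAX/UMIN Legal above lets min/max-shaped
+  // selects, e.g. "select (setgt a, b), a, b", select directly to the
+  // pmaxs*/pmins* and pmaxu*/pminu* instruction families rather than a
+  // compare-plus-blend sequence.)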
@@ -1376,6 +1405,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
+
setOperationAction(ISD::ADD, MVT::v8i64, Legal);
setOperationAction(ISD::ADD, MVT::v16i32, Legal);
@@ -1473,6 +1511,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
@@ -1492,6 +1532,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
@@ -1531,6 +1580,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::v4i32, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Custom);
setOperationAction(ISD::SRA, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
}
// We want to custom lower some of our intrinsics.
@@ -1611,6 +1669,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
@@ -1652,7 +1711,8 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
-EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1724,10 +1784,11 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
@@ -1840,7 +1901,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
if (!Subtarget->is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
- return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
return Table;
}
@@ -2032,7 +2094,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
- SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2041,7 +2104,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
- RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
RetOps[0] = Chain; // Update chain.
@@ -2288,11 +2352,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
- return DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -2471,7 +2535,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (Ins[i].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
- MVT PtrTy = getPointerTy();
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
@@ -2499,7 +2563,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineModuleInfo &MMI = MF.getMMI();
const Function *WinEHParent = nullptr;
- if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+ if (MMI.hasWinEHFuncInfo(Fn))
WinEHParent = MMI.getWinEHParent(Fn);
bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
@@ -2561,11 +2625,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset, dl));
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
@@ -2592,7 +2656,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- } else if (IsWinEHOutlined) {
+ } else if (IsWin64 && IsWinEHOutlined) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2605,8 +2669,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the second integer parameter (rdx) into rsp+16 relative to the
// stack pointer at the entry of the function.
- SDValue RSFIN =
- DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
Chain = DAG.getStore(
@@ -2680,14 +2744,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setArgumentStackSize(StackSize);
if (IsWinEHParent) {
- int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
- SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
- MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
- SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
- Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
- MachinePointerInfo::getFixedStack(UnwindHelpFI),
- /*isVolatile=*/true,
- /*isNonTemporal=*/false, /*Alignment=*/0);
+ if (Is64Bit) {
+ int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+ MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+ SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
+ Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+ MachinePointerInfo::getFixedStack(UnwindHelpFI),
+ /*isVolatile=*/true,
+ /*isNonTemporal=*/false, /*Alignment=*/0);
+ } else {
+ // Functions using Win32 EH are considered to have opaque SP adjustments
+ // to force local variables to be addressed from the frame or base
+ // pointers.
+ MFI->setHasOpaqueSPAdjustment(true);
+ }
}
return Chain;
@@ -2701,7 +2772,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
@@ -2718,7 +2790,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const {
// Adjust the Return address stack slot.
- EVT VT = getPointerTy();
+ EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
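The pattern repeated through the hunks above and below is mechanical: getPointerTy() lost its zero-argument form and now takes the module's DataLayout, so every call site threads one through from the SelectionDAG or MachineFunction. A minimal sketch of the call-site change, assuming the post-patch overload getPointerTy(const DataLayout &); the three lines below simply restate the idiom the hunks themselves use:

  // Before: MVT PtrVT = getPointerTy();
  // After (sketch): fetch the layout from whatever is at hand.
  const DataLayout &Layout = DAG.getDataLayout();  // or MF.getDataLayout()
  MVT PtrVT = getPointerTy(Layout);                // e.g. MVT::i64 on x86-64
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);      // as in the hunks above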
@@ -2942,7 +3014,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
@@ -2955,8 +3027,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
+ RegsToPass.push_back(std::make_pair(
+ unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -3036,16 +3109,16 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
@@ -3064,8 +3137,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
- getPointerTy(), RegInfo->getSlotSize(),
- FPDiff, dl);
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3106,7 +3179,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
- (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ !GV->isStrongDefinitionForLinker() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
@@ -3123,17 +3196,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ExtraLoad = true;
}
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
- G->getOffset(), OpFlags);
+ Callee = DAG.getTargetGlobalAddress(
+ GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
// Add a wrapper if needed.
if (WrapperKind != ISD::DELETED_NODE)
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
// Add extra indirection if needed.
if (ExtraLoad)
- Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(),
- false, false, false, 0);
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
@@ -3152,8 +3226,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
}
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
- OpFlags);
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
} else if (Subtarget->isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3184,9 +3258,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using an MSVC personality, assume
+ // the function clobbers all registers. If an exception is thrown, the runtime
+ // will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+ const Function *CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn->hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isMSVCEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
@@ -3650,7 +3739,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
FuncInfo->setRAIndex(ReturnAddrIndex);
}
- return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
@@ -3881,6 +3970,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasLZCNT();
}
+/// isUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+ if (0 <= Mask[i])
+ return false;
+ return true;
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4322,6 +4420,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// IsUnary to true if only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
@@ -4451,6 +4550,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
if (Mask.empty()) return false;
+ // Mask only contains negative index if an element is zero.
+ if (std::any_of(Mask.begin(), Mask.end(),
+ [](int M){ return M == SM_SentinelZero; }))
+ return false;
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
@@ -4764,7 +4867,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
- MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
@@ -5082,7 +5185,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(),
@@ -6857,6 +6961,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
return SDValue();
}
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len >= 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // All mask elements must be in the lower half.
+ if (M > HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0)
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // We may not have a base (first source) - this can safely be undefined.
+ if (!Base)
+ Base = DAG.getUNDEF(VT);
+
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ }
+ }
+
+ return SDValue();
+ };
+
+ if (SDValue InsertQ = LowerAsInsertQ())
+ return InsertQ;
+
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
@@ -6864,7 +7098,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();
@@ -6901,6 +7135,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
}
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+ SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(0, DL, MVT::i8)));
+ if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+ return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+ SDValue Hi =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(EltBits, DL, MVT::i8)));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
@@ -6991,7 +7247,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7166,9 +7422,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getBitcast(MVT::v2i64, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
- DAG.getConstant(
- V2Index * EltVT.getSizeInBits()/8, DL,
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+ DAG.getDataLayout(), VT)));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -8518,6 +8774,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return V;
+
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@@ -8670,6 +8931,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return V;
+
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
@@ -10613,12 +10879,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MaskEltVT.getSizeInBits());
Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl),
- Idx, DAG.getConstant(0, dl, getPointerTy()));
+ getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+ DAG.getConstant(0, dl, PtrVT));
SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
- Perm, DAG.getConstant(0, dl, getPointerTy()));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+ DAG.getConstant(0, dl, PtrVT));
}
return SDValue();
}
@@ -11009,17 +11276,16 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
- CP->getAlignment(),
- CP->getOffset(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
@@ -11042,17 +11308,16 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
- OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
@@ -11080,24 +11345,24 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
OpFlag = X86II::MO_DARWIN_NONLAZY;
}
- SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
SDLoc DL(Op);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
// For symbols that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
@@ -11112,20 +11377,19 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
- SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
- OpFlags);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
@@ -11139,40 +11403,40 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
CodeModel::Model M = DAG.getTarget().getCodeModel();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
// A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
Offset = 0;
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlags))
- Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
- DAG.getConstant(Offset, dl, getPointerTy()));
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+ DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
@@ -11336,22 +11600,23 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Subtarget->isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget->is64Bit())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
- return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(
- GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
- DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() ==
+ Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -11374,13 +11639,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
- SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
- Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
@@ -11397,8 +11661,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
- Chain.getValue(1));
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget->isTargetKnownWindowsMSVC() ||
@@ -11426,50 +11689,50 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray =
- Subtarget->is64Bit()
- ? DAG.getIntPtrConstant(0x58, dl)
- : (Subtarget->isTargetWindowsGNU()
- ? DAG.getIntPtrConstant(0x2C, dl)
- : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+ SDValue TlsArray = Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
- DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
- MachinePointerInfo(Ptr), false, false, false, 0);
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
+ false, false, 0);
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
- SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget->is64Bit())
- IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32, false, false,
false, 0);
else
- IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
- false, false, false, 0);
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
+ false, false, 0);
- SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
- getPointerTy());
- IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+ auto &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
- res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
- res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
- false, false, false, 0);
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
+ false, 0);
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
- SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
- return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
@@ -11564,8 +11827,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
@@ -11614,7 +11878,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
@@ -11656,7 +11921,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
- SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -11666,7 +11932,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
@@ -11882,6 +12148,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Op.getValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -11904,9 +12171,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
- SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
- getPointerTy(), StackSlot, WordOff);
+ SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo(),
false, false, 0);
@@ -11940,22 +12206,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
- SDValue SignSet = DAG.getSetCC(dl,
- getSetCCResultType(*DAG.getContext(), MVT::i64),
- Op.getOperand(0),
- DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ SDValue SignSet = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)),
- getPointerTy());
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
Zero, Four);
- FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
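The magic constants in LowerUINT_TO_FP_i64 above deserve a gloss: 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84, so ORing the low and high 32-bit halves of the u64 into their mantissas produces 2^52 + lo and 2^84 + hi*2^32, and subtracting the exponent terms recovers the value with a single rounding. A self-contained scalar illustration of the same trick (my own sketch, not the patch's vectorized code; the function name is hypothetical):

  #include <cstdint>
  #include <cstring>

  double u64_to_f64(uint64_t x) {
    // View each 32-bit half as the mantissa of a double with a fixed exponent.
    uint64_t lo = (x & 0xffffffffu) | 0x4330000000000000ull; // == 2^52 + lo
    uint64_t hi = (x >> 32)         | 0x4530000000000000ull; // == 2^84 + hi*2^32
    double dlo, dhi;
    std::memcpy(&dlo, &lo, sizeof dlo);
    std::memcpy(&dhi, &hi, sizeof dhi);
    // Both subtractions are exact; only the final add rounds, so the result
    // is the correctly rounded conversion of x.
    return (dhi - 0x1p84) + (dlo - 0x1p52);
  }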
@@ -11974,6 +12238,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -11998,7 +12263,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
if (!IsSigned && isIntegerTypeFTOL(DstTy))
@@ -12032,7 +12297,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
MachineMemOperand *MMO =
@@ -12403,7 +12668,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantInt::get(*Context, MaskElt);
C = ConstantVector::getSplat(NumElts, C);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -12462,7 +12727,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
CV[0] = ConstantFP::get(*Context,
APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -12483,7 +12749,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -13352,8 +13618,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (hasMinMax) {
switch (SetCCOpcode) {
default: break;
- case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+ case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
@@ -14172,8 +14438,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment =
- DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -14613,7 +14879,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
- EVT SPTy = getPointerTy();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -14630,8 +14896,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
"have nested arguments.");
}
- const TargetRegisterClass *AddrRegClass =
- getRegClassFor(getPointerTy());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
@@ -14666,6 +14931,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -14674,8 +14940,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
}
@@ -14695,8 +14960,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
Store = DAG.getStore(Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
MVT::i32),
@@ -14704,20 +14968,16 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
- SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
MachinePointerInfo(SV, 8),
false, false, 0);
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(8, DL));
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16),
false, false, 0);
MemOps.push_back(Store);
@@ -14739,7 +14999,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
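For reference, the four stores LowerVASTART emits above (at byte offsets 0, 4, 8 and 16 from the va_list pointer) materialize the SysV x86-64 va_list record. The equivalent C view, with field names taken from the psABI (the struct name is mine, and this is an illustration, not code from this patch):

  #include <cstdint>

  struct VaList64 {                // sizeof == 24 on x86-64
    uint32_t gp_offset;            // FuncInfo->getVarArgsGPOffset()
    uint32_t fp_offset;            // FuncInfo->getVarArgsFPOffset(), at FIN+4
    void    *overflow_arg_area;    // VarArgsFrameIndex slot, stored at FIN+8
    void    *reg_save_area;        // RegSaveFrameIndex slot, stored at FIN+16
  };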
@@ -14768,7 +15028,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
- SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+ SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
@@ -14995,6 +15255,20 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+ if (!Fn->hasPersonalityFn())
+ report_fatal_error(
+ "querying registration node size for function without personality");
+ // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+ // WinEHStatePass for the full struct definition.
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+ case EHPersonality::MSVC_X86SEH: return 24;
+ case EHPersonality::MSVC_CXX: return 16;
+ default: break;
+ }
+ report_fatal_error("can only recover FP for MSVC EH personality functions");
+}
+
/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
@@ -15009,7 +15283,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
@@ -15017,15 +15291,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
if (!Fn->hasPersonalityFn())
return EntryEBP;
- // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
- // WinEHStatePass for the full struct definition.
- int RegNodeSize;
- switch (classifyEHPersonality(Fn->getPersonalityFn())) {
- default:
- report_fatal_error("can only recover FP for MSVC EH personality functions");
- case EHPersonality::MSVC_X86SEH: RegNodeSize = 24; break;
- case EHPersonality::MSVC_CXX: RegNodeSize = 16; break;
- }
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration.
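getSEHRegistrationNodeSize above centralizes the 24-byte (SEH) versus 16-byte (C++ EH) registration-node sizes; recoverFramePointer then reduces the recovery to two subtractions, spelled out in the next hunk's comments. A plain-C sketch of that arithmetic (illustrative only; the function name is mine):

  #include <cstdint>

  // RegNodeSize is 24 (6 words) for the SEH personality and 16 (4 words) for
  // C++ EH; RegNodeFrameOffset comes from the frame-offset label emitted by
  // WinEHStatePass.
  uint8_t *recoverParentFP(uint8_t *EntryEBP, int RegNodeSize,
                           int RegNodeFrameOffset) {
    uint8_t *RegNodeBase = EntryEBP - RegNodeSize;   // EH registration node
    return RegNodeBase - RegNodeFrameOffset;         // parent frame pointer
  }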
@@ -15034,7 +15300,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
GlobalValue::getRealLinkageName(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue RegNodeFrameOffset =
- DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSymVal);
+ DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - RegNodeFrameOffset
@@ -15059,6 +15325,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_4OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -15143,7 +15412,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Rnd;
if (Op.getNumOperands() == 6)
Rnd = Op.getOperand(5);
- else
+ else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
Rnd),
@@ -15173,7 +15442,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case VPERM_3OP_MASKZ:
+ case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
@@ -15499,6 +15768,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
"llvm.x86.seh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else // This function handles the SP or FP case.
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
}
}
@@ -15712,34 +15994,60 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
+ const Function *Fn = MF.getFunction();
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
+ assert(Subtarget->getFrameLowering()->hasFP(MF) &&
+ "using llvm.x86.seh.restoreframe requires a frame pointer");
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = TLI.getPointerTy();
+ MVT VT = TLI.getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
unsigned SPReg = RegInfo->getStackRegister();
+ unsigned SlotSize = RegInfo->getSlotSize();
// Get incoming EBP.
SDValue IncomingEBP = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
- // Load [EBP-24] into SP.
- SDValue SPAddr =
- DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, DAG.getConstant(-24, dl, VT));
+ // SP is saved in the first field of every registration node, so load
+ // [EBP-RegNodeSize] into SP.
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+ SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
+ DAG.getConstant(-RegNodeSize, dl, VT));
SDValue NewSP =
DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
false, VT.getScalarSizeInBits() / 8);
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
- // FIXME: Restore the base pointer in case of stack realignment!
+ if (!RegInfo->needsStackRealignment(MF)) {
+ // Adjust EBP to point back to the original frame position.
+ SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
+ } else {
+ assert(RegInfo->hasBasePointer(MF) &&
+ "functions with Win32 EH must use frame or base pointer register");
+
+ // Reload the base pointer (ESI) with the adjusted incoming EBP.
+ SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
+
+ // Reload the spilled EBP value, now that the stack and base pointers are
+ // set up.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setHasSEHFramePtrSave(true);
+ int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
+ MachinePointerInfo(), false, false, false,
+ VT.getScalarSizeInBits() / 8);
+ Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
+ }
- // Adjust EBP to point back to the original frame position.
- SDValue NewFP = recoverFramePointer(DAG, MF.getFunction(), IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
return Chain;
}
@@ -15910,7 +16218,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -15969,14 +16277,36 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const MachineFunction &MF = DAG.getMachineFunction();
+
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
+ .Case("ebp", X86::EBP)
+ .Case("rbp", X86::RBP)
.Default(0);
+
+ if (Reg == X86::EBP || Reg == X86::RBP) {
+ if (!TFI.hasFP(MF))
+ report_fatal_error("register " + StringRef(RegName) +
+ " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+ else {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+ "Invalid Frame Register!");
+ }
+#endif
+ }
+
if (Reg)
return Reg;
+
report_fatal_error("Invalid register name global variable");
}
@@ -15992,7 +16322,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
@@ -16211,7 +16541,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
@@ -16572,7 +16903,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
@@ -16642,9 +16973,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
if (IsSigned && !Subtarget->hasSSE41()) {
- SDValue ShAmt =
- DAG.getConstant(31, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue ShAmt = DAG.getConstant(
+ 31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
@@ -16717,6 +17048,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+ auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+ MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue Ex = DAG.getBitcast(ExVT, R);
+
+ if (ShiftAmt >= 32) {
+ // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+ SDValue Upper = + getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); + SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt - 32, DAG); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {9, 1, 11, 3, 13, 5, 15, 7}); + } else { + // SRA upper i32, SHL whole i64 and select lower i32. + SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt, DAG); + SDValue Lower = + getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); + Lower = DAG.getBitcast(ExVT, Lower); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {8, 1, 10, 3, 12, 5, 14, 7}); + } + return DAG.getBitcast(VT, Ex); + }; + // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { @@ -16725,6 +17088,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + // i64 SRA needs to be performed as partial shifts. + if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); + if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -16808,7 +17176,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (ShAmt != ShiftAmt) return SDValue(); } - return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); } return SDValue(); @@ -16890,7 +17263,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } - return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } @@ -17042,6 +17417,53 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } } + // v4i32 Non Uniform Shifts. + // If the shift amount is constant we can shift each lane using the SSE2 + // immediate shifts, else we need to zero-extend each lane to the lower i64 + // and shift using the SSE2 variable shifts. + // The separate results can then be blended together. + if (VT == MVT::v4i32) { + unsigned Opc = Op.getOpcode(); + SDValue Amt0, Amt1, Amt2, Amt3; + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); + } else { + // ISD::SHL is handled above but we include it here for completeness. 
+ switch (Opc) { + default: + llvm_unreachable("Unknown target vector shift node"); + case ISD::SHL: + Opc = X86ISD::VSHL; + break; + case ISD::SRL: + Opc = X86ISD::VSRL; + break; + case ISD::SRA: + Opc = X86ISD::VSRA; + break; + } + // The SSE2 shifts use the lower i64 as the same shift amount for + // all lanes and the upper i64 is ignored. These shuffle masks + // optimally zero-extend each lane on SSE2/SSE41/AVX targets. + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); + } + + SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); + SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); + SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); + SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); + SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); + return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17944,7 +18366,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ?
(Type*)StructType::get(ArgTy, ArgTy, nullptr) @@ -18443,10 +18866,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::UMAX: return "X86ISD::UMAX"; - case X86ISD::UMIN: return "X86ISD::UMIN"; - case X86ISD::SMAX: return "X86ISD::SMAX"; - case X86ISD::SMIN: return "X86ISD::SMIN"; case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; @@ -18456,6 +18875,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; @@ -18478,6 +18899,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; + case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -18594,16 +19016,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; + case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; + case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; } return nullptr; } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // X86 supports extremely general addressing modes. 
CodeModel::Model M = getTargetMachine().getCodeModel(); @@ -19555,7 +19980,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(getPointerTy()); + getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), @@ -19750,7 +20175,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MemOpndSlot = CurOp; - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -19882,7 +20307,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -21377,7 +21802,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) @@ -21513,14 +21938,15 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); - EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + auto &DL = DAG.getDataLayout(); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); - SDValue ShAmt = DAG.getConstant(32, dl, - DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + SDValue ShAmt = DAG.getConstant( + 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); @@ -21539,10 +21965,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; - SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy()); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + SDValue ScalarAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, @@ -21622,16 +22049,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; + Opc = hasUnsigned ? ISD::UMIN : 0; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; + Opc = hasUnsigned ? ISD::UMAX : 0; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? 
X86ISD::SMIN : 0u; break; + Opc = hasSigned ? ISD::SMIN : 0; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; + Opc = hasSigned ? ISD::SMAX : 0; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && @@ -21640,16 +22067,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; + Opc = hasUnsigned ? ISD::UMAX : 0; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; + Opc = hasUnsigned ? ISD::UMIN : 0; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; + Opc = hasSigned ? ISD::SMAX : 0; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMIN : 0u; break; + Opc = hasSigned ? ISD::SMIN : 0; break; } } @@ -22106,7 +22533,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Check if the selector will be produced by CMPP*/PCMP* Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -22826,7 +23254,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->getZExtValue() == 1) + if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } @@ -23478,7 +23906,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Increment = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); @@ -23687,7 +24116,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); - SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Stride = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); @@ -23760,8 +24190,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
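The matchIntegerMINMAX switches above now return the generic ISD::UMIN/UMAX/SMIN/SMAX opcodes instead of the retired X86ISD variants. A scalar sketch of the pattern being matched (illustrative names, not LLVM API): a select whose condition compares the same two values folds to a min or max, with min and max swapping when the select arms are reversed relative to the comparison.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  enum class CC { ULT, UGT, SLT, SGT }; // subset of ISD condition codes

  // Model of "(a cc b) ? a : b" after the combine picks a min/max node.
  uint32_t foldSelectToMinMax(CC cc, uint32_t a, uint32_t b) {
    switch (cc) {
    case CC::ULT: return std::min(a, b); // ISD::UMIN
    case CC::UGT: return std::max(a, b); // ISD::UMAX
    case CC::SLT: // ISD::SMIN: compare the lanes as signed values
      return (uint32_t)std::min((int32_t)a, (int32_t)b);
    case CC::SGT: // ISD::SMAX
      return (uint32_t)std::max((int32_t)a, (int32_t)b);
    }
    return 0;
  }

  int main() {
    assert(foldSelectToMinMax(CC::ULT, 1u, 0xFFFFFFFFu) == 1u);
    // Signed view: 0xFFFFFFFF is -1, so SMIN picks it.
    assert(foldSelectToMinMax(CC::SLT, 0xFFFFFFFFu, 1u) == 0xFFFFFFFFu);
    return 0;
  }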
@@ -24659,6 +25089,31 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) + // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + + if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's @@ -24913,6 +25368,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -25135,7 +25591,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -25149,7 +25605,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -25176,7 +25632,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType -X86TargetLowering::getConstraintType(const std::string &Constraint) const { +X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': @@ -25508,7 +25964,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. 
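The PerformUINT_TO_FPCombine added above rewrites UINT_TO_FP of vXi8/vXi16 as a zero-extension to i32 lanes followed by SINT_TO_FP when the unsigned form is not legal. Why this is safe, as a scalar model (an illustration, not the DAG code): a zero-extended i8/i16 value is at most 65535, comfortably non-negative as an i32, so signed and unsigned conversion produce identical results.

  #include <cassert>
  #include <cstdint>

  // Scalar model: UINT_TO_FP(i16) == SINT_TO_FP(ZEXT(i16 -> i32)).
  float uintToFloatViaSigned(uint16_t x) {
    int32_t widened = (int32_t)(uint32_t)x; // ZEXT; always >= 0
    return (float)widened;                  // signed convert == unsigned here
  }

  int main() {
    assert(uintToFloatViaSigned(0) == 0.0f);
    assert(uintToFloatViaSigned(65535) == 65535.0f);
    return 0;
  }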
@@ -25717,8 +26173,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return Res; } -int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty, +int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), @@ -25738,7 +26194,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. - if (isLegalAddressingMode(AM, Ty, AS)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 17660891635c..723d5304495c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -233,12 +233,6 @@ namespace llvm { /// Floating point horizontal sub. FHSUB, - /// Unsigned integer max and min. - UMAX, UMIN, - - /// Signed integer max and min. - SMAX, SMIN, - // Integer absolute value ABS, @@ -298,8 +292,8 @@ namespace llvm { // Vector FP round. VFPROUND, - // Vector signed integer to double. - CVTDQ2PD, + // Vector signed/unsigned integer to double. + CVTDQ2PD, CVTUDQ2PD, // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -400,10 +394,15 @@ namespace llvm { VINSERT, VEXTRACT, + /// SSE4A Extraction and Insertion. + EXTRQI, INSERTQI, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, + // Vector multiply packed unsigned integers with round and scale + MULHRS, // FMA nodes FMADD, @@ -429,6 +428,9 @@ namespace llvm { // with rounding mode SINT_TO_FP_RND, UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + FP_TO_SINT_RND, FP_TO_UINT_RND, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, @@ -599,7 +601,9 @@ namespace llvm { unsigned getJumpTableEncoding() const override; bool useSoftFloat() const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, @@ -617,7 +621,8 @@ namespace llvm { /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. - unsigned getByValTypeAlignment(Type *Ty) const override; + unsigned getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const override; /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove @@ -685,7 +690,8 @@ namespace llvm { bool isCheapToSpeculateCtlz() const override; /// Return the value type to use for ISD::SETCC. - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; /// Determine which of the bits specified in Mask are known to be either /// zero or one and return them in the KnownZero/KnownOne bitsets.
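The X86ISelLowering.h hunk above declares X86ISD::EXTRQI and X86ISD::INSERTQI for the SSE4A bit-field instructions. A rough scalar model of their immediate forms, operating on the low 64 bits of an XMM register (a simplification: real EXTRQ/INSERTQ leave some out-of-field bits undefined, while this sketch zeroes or preserves them):

  #include <cassert>
  #include <cstdint>

  // Simplified EXTRQ: extract `len` bits starting at bit `idx`; len == 0
  // selects the full 64 bits. The result is right-justified and zero-filled.
  uint64_t extrq(uint64_t src, unsigned len, unsigned idx) {
    uint64_t mask = (len == 0 || len >= 64) ? ~0ULL : ((1ULL << len) - 1);
    return (src >> idx) & mask;
  }

  // Simplified INSERTQ: write the low `len` bits of `src` into `dst` at `idx`.
  // (Modeled as preserving the other bits; hardware leaves some undefined.)
  uint64_t insertq(uint64_t dst, uint64_t src, unsigned len, unsigned idx) {
    uint64_t mask = (len == 0 || len >= 64) ? ~0ULL : ((1ULL << len) - 1);
    return (dst & ~(mask << idx)) | ((src & mask) << idx);
  }

  int main() {
    assert(extrq(0xABCD1234u, 16, 16) == 0xABCDu);
    assert(insertq(0x0, 0xFF, 8, 8) == 0xFF00u);
    return 0;
  }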
@@ -707,8 +713,7 @@ namespace llvm { bool ExpandInlineAsm(CallInst *CI) const override; - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -726,8 +731,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "i") return InlineAsm::Constraint_i; else if (ConstraintCode == "o") @@ -745,13 +750,12 @@ namespace llvm { /// error, this returns a register number of 0. std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -770,7 +774,7 @@ namespace llvm { /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty, + int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool isVectorShiftByScalarCheap(Type *Ty) const override; @@ -872,7 +876,8 @@ namespace llvm { return nullptr; // nothing to do, move along. } - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. 
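The X86InstrAVX512.td diff that follows wires vpmulhw/vpmulhuw/vpmulhrsw to DAG nodes, including the new X86ISD::MULHRS. For reference, a scalar sketch of the pmulhrsw "multiply high with round and scale" operation on one i16 lane; this mirrors the architectural definition, and the code is an illustration rather than LLVM source:

  #include <cassert>
  #include <cstdint>

  // One lane of pmulhrsw: widen to 32 bits, multiply, shift right 14,
  // round by adding 1, then shift right 1 and truncate to 16 bits.
  int16_t mulhrs(int16_t a, int16_t b) {
    int32_t prod = (int32_t)a * (int32_t)b;
    return (int16_t)(((prod >> 14) + 1) >> 1);
  }

  int main() {
    assert(mulhrs(16384, 16384) == 8192);   // 0.5 * 0.5 = 0.25 in Q15
    assert(mulhrs(-16384, 16384) == -8192); // -0.5 * 0.5 = -0.25
    return 0;
  }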
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b309b8210851..faa91500b181 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -3136,6 +3136,12 @@ defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, + HasBWI, 1>; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, SSE_INTALU_ITINS_P, HasBWI, 1>; @@ -3230,32 +3236,32 @@ let Predicates = [HasBWI] in { defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; //===----------------------------------------------------------------------===// @@ -4035,7 +4041,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, AVX512FMA3Base, EVEX_B; } } @@ -4394,16 +4400,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : 
avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR32, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32, v4f32x_info, i32mem, loadi32, "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86SuintToFpRnd, GR32, v2f64x_info, +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info, i32mem, loadi32, "cvtusi2sd{l}">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -4604,117 +4610,389 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT, - Domain d> { -let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; - def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [], d>, EVEX, EVEX_B, EVEX_RC; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; -} // hasSideEffects = 0 +//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from signed/unsigned integer to float/double +// and from float/double to signed/unsigned integer +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + string Broadcast = _.BroadcastStr, + string Alias = ""> { + + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (_.VT (OpNode (_Src.VT + (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, + "${src}"##Broadcast, "${src}"##Broadcast, + (_.VT (OpNode (_Src.VT + (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + ))>, EVEX, EVEX_B; +} +// Conversion with SAE - suppress all exceptions +multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), + (i32 FROUND_NO_EXC)))>, + EVEX, EVEX_B; } -multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT, - Domain d> {
-let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; -} // hasSideEffects = 0 +// Conversion with rounding control (RC) +multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, + "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; } -defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, - loadv8f64, f512mem, v8f32, v8f64, - SSEPackedSingle>, EVEX_V512, VEX_W, PD, - EVEX_CD8<64, CD8VF>; +// Extend Float to Double +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, + X86vfpextRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + X86vfpext, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>, + EVEX_V256; + } +} + +// Truncate Double to Float +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86vfproundRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, + X86vfpround, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">, + VEX_W, PD, EVEX_CD8<64, CD8VF>; +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">, + PS, EVEX_CD8<32, CD8VH>; -defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, - loadv4f64, f256mem, v8f64, v8f32, - SSEPackedDouble>, EVEX_V512, PS, - EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; -def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), - (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))), - (VCVTPD2PSZrr VR512:$src)>; +let Predicates = [HasVLX] in { + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDZ256rm addr:$src)>; +} -def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), - (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), imm:$rc)), - (VCVTPD2PSZrrb VR512:$src, imm:$rc)>; +// Convert Signed/Unsigned Doubleword to Double +multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128> { + // No rounding in this op + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>, + EVEX_V512; -//===----------------------------------------------------------------------===// -// AVX-512 Vector convert from sign integer to float/double -//===----------------------------------------------------------------------===// + let 
Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, + OpNode128, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>, + EVEX_V256; + } +} -defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, - loadv8i64, i512mem, v16f32, v16i32, - SSEPackedSingle>, EVEX_V512, PS, - EVEX_CD8<32, CD8VF>; +// Convert Signed/Unsigned Doubleword to Float +multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, + OpNodeRnd>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword +multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. 
They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} -defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, - loadv4i64, i256mem, v8f64, v8i32, - SSEPackedDouble>, EVEX_V512, XS, +// Convert Double to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Double +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Float +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, - loadv16f32, f512mem, v16i32, v16f32, - SSEPackedSingle>, EVEX_V512, XS, +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, + X86VSintToFpRnd>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, + X86VFpToSintRnd>, + XS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, + X86VFpToSintRnd>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, + X86VFpToUintRnd>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - loadv8f64, f512mem, v8i32, v8f64, - SSEPackedDouble>, EVEX_V512, PD, VEX_W, +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, + X86VFpToUintRnd>, PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, - loadv16f32, f512mem, v16i32, v16f32, - SSEPackedSingle>, EVEX_V512, PS, +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>, + XS, EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, + X86VUintToFpRnd>, XD, EVEX_CD8<32, CD8VF>; -// cvttps2udq (src, 0, mask-all-ones, sae-current) -def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)), - (VCVTTPS2UDQZrr VR512:$src)>; +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, - loadv8f64, f512mem, v8i32, v8f64, - SSEPackedDouble>, EVEX_V512, PS, VEX_W, +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int, + X86cvtpd2IntRnd>, XD, VEX_W, EVEX_CD8<64, CD8VF>; -// cvttpd2udq (src, 0, mask-all-ones, sae-current) -def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), - (VCVTTPD2UDQZrr VR512:$src)>; +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt, + X86cvtps2UIntRnd>, + PS, EVEX_CD8<32, CD8VF>; +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PS, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, - loadv4i64, f256mem, v8f64, v8i32, -
SSEPackedDouble>, EVEX_V512, XS, - EVEX_CD8<32, CD8VH>; +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int, + X86cvtpd2IntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, - loadv16i32, f512mem, v16f32, v16i32, - SSEPackedSingle>, EVEX_V512, XD, - EVEX_CD8<32, CD8VF>; +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt, + X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, + X86VFpToSlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, + X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, + X86VFpToUlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, + X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; + +let Predicates = [NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -4734,67 +5012,8 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; - -def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), - (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; -def : Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VCVTDQ2PDZrr VR256X:$src)>; -def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), - (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>; -def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VCVTUDQ2PDZrr VR256X:$src)>; - -multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, PatFrag mem_frag, - X86MemOperand x86memop, Domain d> { -let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [], d>, EVEX; - def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [], d>, EVEX, EVEX_B, EVEX_RC; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [], d>, EVEX; -} // hasSideEffects = 0 } -defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512, - loadv16f32, f512mem, 
SSEPackedSingle>, PD, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X, - loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, - EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), imm:$rc)), - (VCVTPS2DQZrrb VR512:$src, imm:$rc)>; - -def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), imm:$rc)), - (VCVTPD2DQZrrb VR512:$src, imm:$rc)>; - -defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512, - loadv16f32, f512mem, SSEPackedSingle>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X, - loadv8f64, f512mem, SSEPackedDouble>, VEX_W, - PS, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), imm:$rc)), - (VCVTPS2UDQZrrb VR512:$src, imm:$rc)>; - -def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2udq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), imm:$rc)), - (VCVTPD2UDQZrrb VR512:$src, imm:$rc)>; - let Predicates = [HasAVX512] in { def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), (VCVTPD2PSZrm addr:$src)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 6ab961f04ecf..4cd5563ce727 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -105,14 +105,16 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in // jecxz. let Uses = [CX] in def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jcxz\t$dst", [], IIC_JCXZ>, AdSize16; + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16, + Requires<[Not64BitMode]>; let Uses = [ECX] in def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; let Uses = [RCX] in def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64; + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64, + Requires<[In64BitMode]>; } // Indirect branches diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index fe245c3a7e38..1f61ffa84e9a 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -39,11 +39,6 @@ def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; -def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>; -def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>; -def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>; -def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>; - def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -75,6 +70,9 @@ def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; +def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -187,6 +185,7 @@ def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>; def X86avg : SDNode<"X86ISD::AVG" , 
SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; @@ -208,6 +207,14 @@ def X86pmuldq : SDNode<"X86ISD::PMULDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. // Note: this is a work in progress... @@ -357,8 +364,70 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; -def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; -def X86SuintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i32>, + SDTCisInt<2>]>; +def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i64>, + SDTCisInt<2>]>; + +def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i32>, + SDTCisInt<2>]>; +def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i64>, + SDTCisInt<2>]>; + +// Scalar +def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; + +// Vector with rounding mode + +// cvtt fp-to-int stuff +def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>; +def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>; +def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>; +def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>; + +def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>; +def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>; +def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>; + +// cvt fp-to-int stuff +def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>; +def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>; +def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>; +def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>; + +// Vector without rounding mode +def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>; +def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; +def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; +def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; + +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<2>]>>; +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisInt<2>]>>; //===----------------------------------------------------------------------===// // SSE Complex Patterns diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b92ba99fb100..786150760b93 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -269,14 +269,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8mr, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) { - unsigned RegOp = MemoryFoldTable2Addr[i].RegOp; - unsigned MemOp = MemoryFoldTable2Addr[i].MemOp; - unsigned Flags = MemoryFoldTable2Addr[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 0, folded load and store, no alignment requirement. - Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { @@ -424,12 +421,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) { - unsigned RegOp = MemoryFoldTable0[i].RegOp; - unsigned MemOp = MemoryFoldTable0[i].MemOp; - unsigned Flags = MemoryFoldTable0[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, - RegOp, MemOp, TB_INDEX_0 | Flags); + Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); } static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { @@ -862,14 +856,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) { - unsigned RegOp = MemoryFoldTable1[i].RegOp; - unsigned MemOp = MemoryFoldTable1[i].MemOp; - unsigned Flags = MemoryFoldTable1[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 1, folded load - Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { @@ -1116,6 +1107,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, @@ -1412,6 +1405,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, @@ -1733,14 +1728,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SHA256RNDS2rr, 
X86::SHA256RNDS2rm, TB_ALIGN_16 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) { - unsigned RegOp = MemoryFoldTable2[i].RegOp; - unsigned MemOp = MemoryFoldTable2[i].MemOp; - unsigned Flags = MemoryFoldTable2[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) { AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 2, folded load - Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { @@ -1949,14 +1941,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) { - unsigned RegOp = MemoryFoldTable3[i].RegOp; - unsigned MemOp = MemoryFoldTable3[i].MemOp; - unsigned Flags = MemoryFoldTable3[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 3, folded load - Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { @@ -2001,14 +1990,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) { - unsigned RegOp = MemoryFoldTable4[i].RegOp; - unsigned MemOp = MemoryFoldTable4[i].MemOp; - unsigned Flags = MemoryFoldTable4[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 4, folded load - Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } } @@ -3820,7 +3806,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, X86::MOVPQIto64rr); if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. - return X86::MOVSDto64rr; + return X86::MMX_MOVD64from64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. if (X86::VR128XRegClass.contains(DestReg)) @@ -3828,7 +3814,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, X86::MOV64toPQIrr); // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) - return X86::MOV64toSDrr; + return X86::MMX_MOVD64to64rr; } // SrcReg(FR32) -> DestReg(GR32) @@ -6413,22 +6399,40 @@ static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { hasVirtualRegDefsInBasicBlock(*MI1, MBB) && MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) return true; - + return false; } +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (and, or) +static bool isAssociativeAndCommutative(unsigned Opcode) { + switch (Opcode) { + case X86::ADDSDrr: + case X86::ADDSSrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::MULSDrr: + case X86::MULSSrr: + case X86::VMULSDrr: + case X86::VMULSSrr: + return true; + default: + return false; + } +} + /// Return true if the input instruction is part of a chain of dependent ops /// that are suitable for reassociation, otherwise return false. /// If the instruction's operands must be commuted to have a previous /// instruction of the same type define the first source operand, Commuted will /// be set to true. 
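//
// [Editor's note] A minimal sketch of the rewrite the machine combiner performs
// once isAssociativeAndCommutative() approves the opcode (virtual register
// names are illustrative, not from the patch):
//
//   %vr2 = ADDSSrr %vr0, %vr1      // Prev: B = A op X
//   %vr4 = ADDSSrr %vr2, %vr3      // Root: C = B op Y
//
// becomes
//
//   %vr5 = ADDSSrr %vr1, %vr3      // X op Y no longer waits on the def of A
//   %vr4 = ADDSSrr %vr0, %vr5      // C = A op (X op Y)
//
// which is what reassociateOps() below emits (MIB1, then MIB2); associativity
// and commutativity of the opcode are what make the two forms equivalent.
//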
-static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode, - bool &Commuted) { - // 1. The instruction must have the correct type. +static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { + // 1. The operation must be associative and commutative. // 2. The instruction must have virtual register definitions for its // operands in the same basic block. - // 3. The instruction must have a reassociatable sibling. - if (Inst.getOpcode() == AssocOpcode && + // 3. The instruction must have a reassociable sibling. + if (isAssociativeAndCommutative(Inst.getOpcode()) && hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && hasReassocSibling(Inst, Commuted)) return true; @@ -6455,14 +6459,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, // B = A op X (Prev) // C = B op Y (Root) - // TODO: There are many more associative instruction types to match: - // 1. Other forms of scalar FP add (non-AVX) - // 2. Other data types (double, integer, vectors) - // 3. Other math / logic operations (mul, and, or) - unsigned AssocOpcode = X86::VADDSSrr; - - bool Commute = false; - if (isReassocCandidate(Root, AssocOpcode, Commute)) { + bool Commute; + if (isReassocCandidate(Root, Commute)) { // We found a sequence of instructions that may be suitable for a // reassociation of operands to increase ILP. Specify each commutation // possibility for the Prev instruction in the sequence and let the @@ -6512,7 +6510,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); MachineOperand &OpC = Root.getOperand(0); - + unsigned RegA = OpA.getReg(); unsigned RegB = OpB.getReg(); unsigned RegX = OpX.getReg(); @@ -6547,7 +6545,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, .addReg(RegX, getKillRegState(KillX)) .addReg(RegY, getKillRegState(KillY)); InsInstrs.push_back(MIB1); - + MachineInstrBuilder MIB2 = BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) .addReg(RegA, getKillRegState(KillA)) @@ -6579,7 +6577,7 @@ void X86InstrInfo::genAlternativeCodeSequence( Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); } assert(Prev && "Unknown pattern for machine combiner"); - + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); return; } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 6f38cb8eaf33..52bab9c79b45 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -194,7 +194,7 @@ def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; -def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER", +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; @@ -1028,14 +1028,13 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[Not64BitMode]>; + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; + def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; -def PUSHi16 
: Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[Not64BitMode]>; def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; @@ -1081,9 +1080,6 @@ let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; -def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2a896dfe8aa8..a5ff9edf05a3 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4035,13 +4035,13 @@ defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, SSE_INTALU_ITINS_P, 0>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, SSE_INTALU_ITINS_P, 0>; -defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8, +defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, SSE_INTALU_ITINS_P, 1>; -defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16, +defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, SSE_INTALU_ITINS_P, 1>; -defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8, +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, SSE_INTALU_ITINS_P, 1>; -defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16, +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SSE_INTALU_ITINS_P, 1>; // Intrinsic forms @@ -6834,29 +6834,28 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } let Predicates = [HasAVX, NoVLX] in { - let isCommutable = 0 in - defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; defm VPMULDQ : 
SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, @@ -6865,29 +6864,28 @@ let Predicates = [HasAVX, NoVLX] in { } let Predicates = [HasAVX2, NoVLX] in { - let isCommutable = 0 in - defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, @@ -6896,22 +6894,21 @@ let Predicates = [HasAVX2, NoVLX] in { } let Constraints = "$src1 = $dst" in { - let isCommutable = 0 in - defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, + defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128, + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128, + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128, + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128, + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128, + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, VR128, 
memopv2i64, i128mem, @@ -7773,7 +7770,7 @@ let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, imm:$idx))]>, PD; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -7784,8 +7781,8 @@ def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, - VR128:$src2, imm:$len, imm:$idx))]>, XD; + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, XD; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "insertq\t{$mask, $src|$src, $mask}", diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 61a33484b8bf..2c8b95bcba22 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -19,7 +19,7 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, @@ -213,18 +213,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), 
X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), @@ -596,60 +596,69 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - 
X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), @@ -1008,10 +1017,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, 
INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), @@ -1049,14 +1058,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), @@ -1070,6 +1079,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index d598b55aae3e..e6db9708b677 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -30,59 +30,67 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// pointer for reasons other than it containing dynamic allocation or /// that FP eliminatation is turned off. For example, Cygwin main function /// contains stack pointer re-alignment code which requires FP. - bool ForceFramePointer; + bool ForceFramePointer = false; /// RestoreBasePointerOffset - Non-zero if the function has base pointer /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a /// displacement from the frame pointer to a slot where the base pointer /// is stashed. - signed char RestoreBasePointerOffset; + signed char RestoreBasePointerOffset = 0; /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. - unsigned CalleeSavedFrameSize; + unsigned CalleeSavedFrameSize = 0; /// BytesToPopOnReturn - Number of bytes function pops on return (in addition /// to the space used by the return address). 
/// Used on windows platform for stdcall & fastcall name decoration - unsigned BytesToPopOnReturn; + unsigned BytesToPopOnReturn = 0; /// ReturnAddrIndex - FrameIndex for return slot. - int ReturnAddrIndex; + int ReturnAddrIndex = 0; /// \brief FrameIndex for return slot. - int FrameAddrIndex; + int FrameAddrIndex = 0; /// TailCallReturnAddrDelta - The number of bytes by which return address /// stack slot is moved as the result of tail call optimization. - int TailCallReturnAddrDelta; + int TailCallReturnAddrDelta = 0; /// SRetReturnReg - Some subtargets require that sret lowering includes /// returning the value of the returned struct in a register. This field /// holds the virtual register into which the sret argument is passed. - unsigned SRetReturnReg; + unsigned SRetReturnReg = 0; /// GlobalBaseReg - keeps track of the virtual register initialized for /// use as the global base register. This is used for PIC in some PIC /// relocation models. - unsigned GlobalBaseReg; + unsigned GlobalBaseReg = 0; /// VarArgsFrameIndex - FrameIndex for start of varargs area. - int VarArgsFrameIndex; + int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. - int RegSaveFrameIndex; + int RegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. - unsigned VarArgsGPOffset; + unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. - unsigned VarArgsFPOffset; + unsigned VarArgsFPOffset = 0; /// ArgumentStackSize - The number of bytes on stack consumed by the arguments /// being passed on the stack. - unsigned ArgumentStackSize; + unsigned ArgumentStackSize = 0; /// NumLocalDynamics - Number of local-dynamic TLS accesses. - unsigned NumLocalDynamics; + unsigned NumLocalDynamics = 0; /// HasPushSequences - Keeps track of whether this function uses sequences /// of pushes to pass function parameters. - bool HasPushSequences; + bool HasPushSequences = false; + + /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill + /// slot for the frame pointer. + bool HasSEHFramePtrSave = false; + + /// The frame index of a stack object containing the original frame pointer + /// used to address arguments in a function using a base pointer. 
+ int SEHFramePtrSaveIndex = 0; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers @@ -90,40 +98,9 @@ private: SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; public: - X86MachineFunctionInfo() : ForceFramePointer(false), - RestoreBasePointerOffset(0), - CalleeSavedFrameSize(0), - BytesToPopOnReturn(0), - ReturnAddrIndex(0), - FrameAddrIndex(0), - TailCallReturnAddrDelta(0), - SRetReturnReg(0), - GlobalBaseReg(0), - VarArgsFrameIndex(0), - RegSaveFrameIndex(0), - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} - - explicit X86MachineFunctionInfo(MachineFunction &MF) - : ForceFramePointer(false), - RestoreBasePointerOffset(0), - CalleeSavedFrameSize(0), - BytesToPopOnReturn(0), - ReturnAddrIndex(0), - FrameAddrIndex(0), - TailCallReturnAddrDelta(0), - SRetReturnReg(0), - GlobalBaseReg(0), - VarArgsFrameIndex(0), - RegSaveFrameIndex(0), - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} + X86MachineFunctionInfo() = default; + + explicit X86MachineFunctionInfo(MachineFunction &MF) {}; bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -174,6 +151,12 @@ public: unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; } + void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; } + + int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } + void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 0033b5058187..d8495e53e0e3 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -202,7 +202,7 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { @@ -343,7 +343,7 @@ X86RegisterInfo::getNoPreservedMask() const { BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); @@ -452,7 +452,7 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // use both the SP and the FP, we need a separate base pointer register. 
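//
// [Editor's note] The X86MachineFunctionInfo hunk above trades two hand-written
// constructor init-lists for C++11 in-class default member initializers. A
// minimal sketch of the idiom, using a hypothetical type rather than the real
// class:
//
//   struct FuncInfoSketch {
//     bool ForceFramePointer = false;    // every constructor starts from these
//     unsigned CalleeSavedFrameSize = 0; // defaults, so a new field cannot be
//     int ReturnAddrIndex = 0;           // forgotten in one of the init-lists
//     FuncInfoSketch() = default;        // nothing left to keep in sync
//   };
//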
bool CantUseFP = needsStackRealignment(MF); bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust(); + MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); return CantUseFP && CantUseSP; } @@ -477,9 +477,9 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86FrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = - MF.getSubtarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -503,7 +503,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; @@ -519,18 +519,17 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); - // FRAME_ALLOC uses a single offset, with no register. It only works in the + // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. - if (Opc == TargetOpcode::FRAME_ALLOC) { + if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; if (IsWinEH) - Offset = static_cast<const X86FrameLowering *>(TFI) - ->getFrameIndexOffsetFromSP(MF, FrameIndex); + Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); else Offset = TFI->getFrameIndexOffset(MF, FrameIndex); FI.ChangeToImmediate(Offset); @@ -584,7 +583,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? FramePtr : StackPtr; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 5ca40bc0091b..ce79fcf9ad81 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -24,11 +24,6 @@ using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -X86SelectionDAGInfo::~X86SelectionDAGInfo() {} - bool X86SelectionDAGInfo::isBaseRegConflictPossible( SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { // We cannot use TRI->hasBasePointer() until *after* we select all basic @@ -37,7 +32,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( // dynamic stack adjustments (hopefully rare) and the base pointer would // conflict if we had to use it. 
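//
// [Editor's note] The same predicate rename lands at both call sites in this
// section (X86RegisterInfo::hasBasePointer above and isBaseRegConflictPossible
// here); the test itself is unchanged:
//
//   before: MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust()
//   after:  MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment()
//
// The broader name presumably reflects that inline asm is only one source of
// stack-pointer adjustments the frame lowering cannot see through.
//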
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (!MFI->hasVarSizedObjects() && !MFI->hasInlineAsmWithSPAdjust()) + if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment()) return false; const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( @@ -81,8 +76,9 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + EVT IntPtr = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index eb7e0ed9de6c..961bd8c8d5ef 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -29,8 +29,7 @@ class X86SelectionDAGInfo : public TargetSelectionDAGInfo { ArrayRef<unsigned> ClobberSet) const; public: - explicit X86SelectionDAGInfo(const DataLayout &DL); - ~X86SelectionDAGInfo(); + explicit X86SelectionDAGInfo() = default; SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 3b25d30dc221..dff3624b7efe 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -68,7 +68,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; - bool isDecl = GV->isDeclarationForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); // X86-64 in PIC mode. if (isPICStyleRIPRel()) { @@ -80,8 +80,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If symbol visibility is hidden, the extra load is not needed if // target is x86-64 or the symbol is definitely defined in the current // translation unit. - if (GV->hasDefaultVisibility() && - (isDecl || GV->isWeakForLinker())) + if (GV->hasDefaultVisibility() && !isDef) return X86II::MO_GOTPCREL; } else if (!isTargetWin64()) { assert(isTargetELF() && "Unknown rip-relative target"); @@ -107,7 +106,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If this is a strong reference to a definition, it is definitely not // through a stub. - if (!isDecl && !GV->isWeakForLinker()) + if (isDef) return X86II::MO_PIC_BASE_OFFSET; // Unless we have a symbol with hidden visibility, we have to go through a @@ -117,7 +116,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If symbol visibility is hidden, we have a stub for common symbol // references and external declarations. - if (isDecl || GV->hasCommonLinkage()) { + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) { // Hidden $non_lazy_ptr reference. return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE; } @@ -131,7 +130,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If this is a strong reference to a definition, it is definitely not // through a stub. 
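//
// [Editor's note] The isDecl -> isDef rewrite in ClassifyGlobalReference leans
// on the GlobalValue helper it switches to; assuming that helper's definition
// at the time,
//
//   GV->isStrongDefinitionForLinker()
//       == !(GV->isDeclarationForLinker() || GV->isWeakForLinker())
//
// the new "if (isDef)" below is exactly the old
// "if (!isDecl && !GV->isWeakForLinker())", and the earlier
// "(isDecl || GV->isWeakForLinker())" guard becomes "!isDef" by De Morgan.
//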
- if (!isDecl && !GV->isWeakForLinker()) + if (isDef) return X86II::MO_NO_FLAG; // Unless we have a symbol with hidden visibility, we have to go through a @@ -193,12 +192,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } - // If feature string is not empty, parse features string. + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); - // Make sure the right MCSchedModel is used. - InitCPUSchedModel(CPUName); - InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -298,9 +294,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU, TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - TSInfo(*TM.getDataLayout()), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering(*this, getStackAlignment()) { + TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (TM.getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d420abbe1433..f026d4295f71 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -447,8 +447,26 @@ public: } bool isCallingConvWin64(CallingConv::ID CC) const { - return (isTargetWin64() && CC != CallingConv::X86_64_SysV) || - CC == CallingConv::X86_64_Win64; + switch (CC) { + // On Win64, all these conventions just use the default convention. + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::X86_FastCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case CallingConv::Intel_OCL_BI: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::X86_64_Win64: + return true; + // This convention allows using the SysV convention on Windows targets. + case CallingConv::X86_64_SysV: + return false; + // Otherwise, who knows what this is. + default: + return false; + } } /// ClassifyGlobalReference - Classify a global variable reference for the diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 0c82a700952b..7df726091843 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -89,7 +89,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -117,6 +117,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry<MVT::SimpleValueType> AVX2UniformConstCostTable[] = { + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence @@ -211,6 +213,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. 
{ ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -261,12 +264,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. // It is not a good idea to vectorize division. We have to scalarize it and @@ -352,7 +355,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); unsigned Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -364,7 +367,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. @@ -464,8 +467,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst); + std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); static const TypeConversionCostTblEntry<MVT::SimpleValueType> SSE2ConvTbl[] = { @@ -537,8 +540,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Idx != -1) return AVX512ConversionTbl[Idx].Cost; } - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) @@ -667,7 +670,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -740,7 +743,7 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. 
if (!LT.second.isVector()) @@ -803,7 +806,7 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); @@ -850,9 +853,9 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); unsigned Cost = 0; - if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() && + if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + @@ -887,7 +890,7 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -1117,11 +1120,11 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { int DataWidth = DataTy->getPrimitiveSizeInBits(); - + // Todo: AVX512 allows gather/scatter, works with strided and random as well if ((DataWidth < 32) || (Consecutive == 0)) return false; - if (ST->hasAVX512() || ST->hasAVX2()) + if (ST->hasAVX512() || ST->hasAVX2()) return true; return false; } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index a83158440193..da3f36c2e27e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -40,7 +40,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { public: explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
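//
// [Editor's note] Just below, the patch deletes the hand-written copy/move
// assignment operators of X86TTIImpl while keeping the copy/move constructors.
// A plausible reading (an assumption; the patch does not say): BaseT is now
// constructed with the module's DataLayout, per the constructor change above,
//
//   X86TTIImpl(const X86TargetMachine *TM, Function &F)
//       : BaseT(TM, F.getParent()->getDataLayout()), ... {}
//
// and base-class state tied to a DataLayout can be copy-constructed but not
// usefully reassigned, so the assignment operators are dropped rather than
// kept in sync by hand.
//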
X86TTIImpl(const X86TTIImpl &Arg) @@ -48,18 +49,6 @@ public: X86TTIImpl(X86TTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - X86TTIImpl &operator=(const X86TTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - X86TTIImpl &operator=(X86TTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 90357257b9ef..9190d0be9e4d 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -113,8 +113,8 @@ char WinEHStatePass::ID = 0; bool WinEHStatePass::doInitialization(Module &M) { TheModule = &M; - FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::frameescape); - FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::framerecover); + FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); + FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover); FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress); return false; } @@ -133,7 +133,7 @@ bool WinEHStatePass::doFinalization(Module &M) { void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { // This pass should only insert a stack allocation, memory accesses, and - // framerecovers. + // localrecovers. AU.setPreservesCFG(); } @@ -336,9 +336,11 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { FunctionType *TargetFuncTy = FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5), /*isVarArg=*/false); - Function *Trampoline = Function::Create( - TrampolineTy, GlobalValue::InternalLinkage, - Twine("__ehhandler$") + ParentFunc->getName(), TheModule); + Function *Trampoline = + Function::Create(TrampolineTy, GlobalValue::InternalLinkage, + Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + ParentFunc->getName()), + TheModule); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); IRBuilder<> Builder(EntryBB); Value *LSDA = emitEHLSDA(Builder, ParentFunc); @@ -419,14 +421,14 @@ void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { } /// Escape RegNode so that we can access it from child handlers. Find the call -/// to frameescape, if any, in the entry block and append RegNode to the list +/// to localescape, if any, in the entry block and append RegNode to the list /// of arguments. int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to frameescape and extract its arguments. + // Find the call to localescape and extract its arguments. IntrinsicInst *EscapeCall = nullptr; for (Instruction &I : F.getEntryBlock()) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::frameescape) { + if (II && II->getIntrinsicID() == Intrinsic::localescape) { EscapeCall = II; break; } @@ -440,8 +442,10 @@ int WinEHStatePass::escapeRegNode(Function &F) { // Replace the call (if it exists) with new one. Otherwise, insert at the end // of the entry block. - IRBuilder<> Builder(&F.getEntryBlock(), - EscapeCall ? 
EscapeCall : F.getEntryBlock().end()); + Instruction *InsertPt = EscapeCall; + if (!EscapeCall) + InsertPt = F.getEntryBlock().getTerminator(); + IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); Builder.CreateCall(FrameEscape, Args); if (EscapeCall) EscapeCall->eraseFromParent(); @@ -520,6 +524,11 @@ void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { for (auto &Handler : ActionList) { if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); +#ifndef NDEBUG + for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) + assert(Pred->isLandingPad() && + "WinEHPrepare failed to split block"); +#endif ExceptBlocks.insert(BA->getBasicBlock()); } } diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index ac954d0a8fa4..b4085835f285 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -40,7 +40,7 @@ static MCInstrInfo *createXCoreMCInstrInfo() { return X; } -static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitXCoreMCRegisterInfo(X, XCore::LR); return X; @@ -48,9 +48,7 @@ static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitXCoreMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createXCoreMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI, @@ -64,7 +62,8 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createXCoreMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index bd834cc5be4b..76c3d8130e75 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -525,12 +525,15 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -void XCoreFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); - bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool LRUsed = MRI.isPhysRegModified(XCore::LR); if (!LRUsed && !MF.getFunction()->isVarArg() && MF.getFrameInfo()->estimateStackSize(MF)) @@ -550,7 +553,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (LRUsed) { // We will handle the LR in the prologue/epilogue // and allocate space on the stack ourselves. 
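//
// [Editor's note] A before/after sketch of the callee-saved-register hook
// migration this XCore hunk performs (signatures as they appear in the patch):
//
//   before: void processFunctionBeforeCalleeSavedScan(MF, RS);
//           MF.getRegInfo().setPhysRegUnused(XCore::LR);  // mutate global state
//   after:  void determineCalleeSaves(MF, SavedRegs, RS); // base class first
//           SavedRegs.reset(XCore::LR);                   // clear a mask bit
//
// The query side moves the same way: isPhysRegUsed(XCore::LR) becomes
// MRI.isPhysRegModified(XCore::LR), asking whether LR is ever written instead
// of consulting a usage flag kept elsewhere.
//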
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 607c77248952..69c71adc8d3f 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -47,8 +47,8 @@ namespace llvm {
 
     bool hasFP(const MachineFunction &MF) const override;
 
-    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                              RegScavenger *RS = nullptr) const override;
+    void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                              RegScavenger *RS = nullptr) const override;
 
     void processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                              RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index f5b180b1ac0d..9d4a966dfba4 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -144,10 +144,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
                                      MVT::i32, MskSize);
       }
       else if (!isUInt<16>(Val)) {
-        SDValue CPIdx =
-          CurDAG->getTargetConstantPool(ConstantInt::get(
-                                          Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                        getTargetLowering()->getPointerTy());
+        SDValue CPIdx = CurDAG->getTargetConstantPool(
+            ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+            getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
         SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
                                               MVT::Other, CPIdx,
                                               CurDAG->getEntryNode());
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index aa71241102ff..d62e7428299d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -281,7 +281,8 @@ static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL)
   if (!ObjType->isSized())
     return false;
 
-  unsigned ObjSize = XTL.getDataLayout()->getTypeAllocSize(ObjType);
+  auto &DL = GV->getParent()->getDataLayout();
+  unsigned ObjSize = DL.getTypeAllocSize(ObjType);
   return ObjSize < CodeModelLargeSize && ObjSize != 0;
 }
 
@@ -312,8 +313,9 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
     Constant *GAI = ConstantExpr::getGetElementPtr(
         Type::getInt8Ty(*DAG.getContext()), GA, Idx);
     SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
-    return DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), CP,
-                       MachinePointerInfo(), false, false, false, 0);
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
+                       DAG.getEntryNode(), CP, MachinePointerInfo(), false,
+                       false, false, 0);
   }
 }
 
@@ -321,11 +323,11 @@ SDValue XCoreTargetLowering::
 LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
 {
   SDLoc DL(Op);
-
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
 
-  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, PtrVT, Result);
 }
 
 SDValue XCoreTargetLowering::
@@ -378,9 +380,10 @@ SDValue XCoreTargetLowering::
 lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
                                        int64_t Offset, SelectionDAG &DAG) const
 {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
   if ((Offset & 0x3) == 0) {
-    return DAG.getLoad(getPointerTy(), DL, Chain, Base, MachinePointerInfo(),
-                       false, false, false, 0);
+    return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo(), false,
+                       false, false, 0);
   }
   // Lower to pair of consecutive word aligned loads plus some bit shifting.
   int32_t HighOffset = RoundUpToAlignment(Offset, 4);
@@ -401,11 +404,9 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
   SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, DL, MVT::i32);
   SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, DL, MVT::i32);
 
-  SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
-                            LowAddr, MachinePointerInfo(),
+  SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo(),
                             false, false, false, 0);
-  SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
-                             HighAddr, MachinePointerInfo(),
+  SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo(),
                              false, false, false, 0);
   SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
   SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
@@ -435,8 +436,9 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
                                     LD->getAlignment()))
     return SDValue();
 
-  unsigned ABIAlignment = getDataLayout()->
-    getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  auto &TD = DAG.getDataLayout();
+  unsigned ABIAlignment = TD.getABITypeAlignment(
+      LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
   // Leave aligned load alone.
   if (LD->getAlignment() >= ABIAlignment)
     return SDValue();
@@ -486,7 +488,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Lower to a call to __misaligned_load(BasePtr).
-  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -495,10 +497,11 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL).setChain(Chain)
-    .setCallee(CallingConv::C, IntPtrTy,
-               DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
-               std::move(Args), 0);
+  CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+      CallingConv::C, IntPtrTy,
+      DAG.getExternalSymbol("__misaligned_load",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args), 0);
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   SDValue Ops[] = { CallResult.first, CallResult.second };
@@ -516,8 +519,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
                                     ST->getAlignment())) {
     return SDValue();
   }
-  unsigned ABIAlignment = getDataLayout()->
-    getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+      ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
   // Leave aligned store alone.
   if (ST->getAlignment() >= ABIAlignment) {
     return SDValue();
   }
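Most of the churn in this file is one mechanical migration: TargetLowering::getPointerTy() and the alignment queries no longer reach for a cached DataLayout but take the one the SelectionDAG (and ultimately the module) already owns. A condensed before/after sketch, with `TLI`, `DAG`, and `Ty` standing for the usual lowering objects:

  // Before (3.6-era API): the DataLayout was fetched implicitly.
  //   EVT PtrVT = TLI.getPointerTy();
  //   unsigned Align = TLI.getDataLayout()->getABITypeAlignment(Ty);
  // After this import: the DataLayout is passed in explicitly.
  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  unsigned Align = DAG.getDataLayout().getABITypeAlignment(Ty);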
@@ -545,7 +548,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   }
 
   // Lower to a call to __misaligned_store(BasePtr, Value).
-  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -557,10 +560,11 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-               DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
-               std::move(Args), 0);
+  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__misaligned_store",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args), 0);
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.second;
@@ -833,9 +837,9 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   int FI = XFI->createLRSpillSlot(MF);
   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
-  return DAG.getLoad(getPointerTy(), SDLoc(Op), DAG.getEntryNode(), FIN,
-                     MachinePointerInfo::getFixedStack(FI), false, false,
-                     false, 0);
+  return DAG.getLoad(
+      getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN,
+      MachinePointerInfo::getFixedStack(FI), false, false, false, 0);
 }
 
 SDValue XCoreTargetLowering::
@@ -979,11 +983,10 @@ LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
   if (N->getMemoryVT() == MVT::i32) {
     if (N->getAlignment() < 4)
       report_fatal_error("atomic load must be aligned");
-    return DAG.getLoad(getPointerTy(), SDLoc(Op), N->getChain(),
-                       N->getBasePtr(), N->getPointerInfo(),
-                       N->isVolatile(), N->isNonTemporal(),
-                       N->isInvariant(), N->getAlignment(),
-                       N->getAAInfo(), N->getRanges());
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                       N->getChain(), N->getBasePtr(), N->getPointerInfo(),
+                       N->isVolatile(), N->isNonTemporal(), N->isInvariant(),
+                       N->getAlignment(), N->getAAInfo(), N->getRanges());
   }
   if (N->getMemoryVT() == MVT::i16) {
     if (N->getAlignment() < 2)
@@ -1150,9 +1153,10 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = RetCCInfo.getNextStackOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
 
-  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, dl,
-                                      getPointerTy(), true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector<SDValue, 12> MemOpChains;
@@ -1239,11 +1243,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain,
-                             DAG.getConstant(NumBytes, dl, getPointerTy(),
-                                             true),
-                             DAG.getConstant(0, dl, getPointerTy(), true),
-                             InFlag, dl);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+                             DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -1830,7 +1831,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       if (StoreBits % 8) {
         break;
       }
-      unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
+      unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
           ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
       unsigned Alignment = ST->getAlignment();
       if (Alignment >= ABIAlignment) {
@@ -1924,15 +1925,13 @@ static inline bool isImmUs4(int64_t val)
 
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
-bool
-XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                           Type *Ty,
-                                           unsigned AS) const {
+bool XCoreTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
-  const DataLayout *TD = TM.getDataLayout();
-  unsigned Size = TD->getTypeAllocSize(Ty);
+  unsigned Size = DL.getTypeAllocSize(Ty);
   if (AM.BaseGV) {
     return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
                  AM.BaseOffs%4 == 0;
@@ -1970,7 +1969,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
 
 std::pair<unsigned, const TargetRegisterClass *>
 XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                                  const std::string &Constraint,
+                                                  StringRef Constraint,
                                                   MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 97f0494b6fe3..ddd675c5164d 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -101,7 +101,9 @@ namespace llvm {
 
     unsigned getJumpTableEncoding() const override;
 
-    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+    MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override {
+      return MVT::i32;
+    }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
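The same DataLayout threading reaches the TargetLowering hooks themselves (getScalarShiftAmountTy, isLegalAddressingMode), and inline-asm constraints shrink from const std::string& to StringRef. A sketch of an override against the new signatures, using a hypothetical MyTargetLowering and a made-up legality rule for illustration:

  bool MyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
                                               unsigned AS) const {
    // Type sizes now come from the DataLayout parameter, not TM.getDataLayout().
    unsigned Size = DL.getTypeAllocSize(Ty);
    return AM.Scale == 0 && AM.BaseOffs % 4 == 0 && Size >= 4;
  }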
@@ -120,8 +122,8 @@ namespace llvm {
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const override;
 
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
-                               unsigned AS) const override;
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
 
   private:
     const TargetMachine &TM;
@@ -175,8 +177,7 @@ namespace llvm {
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                 const std::string &Constraint,
-                                 MVT VT) const override;
+                                 StringRef Constraint, MVT VT) const override;
 
     // Expand specifics
     SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 1d569e8936df..1cfb57dc3af3 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -222,7 +222,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     XCore::R8, XCore::R9,
     0
   };
-  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(*MF);
   if (TFI->hasFP(*MF))
     return CalleeSavedRegsFP;
   return CalleeSavedRegs;
@@ -230,7 +230,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
 
   Reserved.set(XCore::CP);
   Reserved.set(XCore::DP);
@@ -270,7 +270,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   const XCoreInstrInfo &TII =
       *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   int StackSize = MF.getFrameInfo()->getStackSize();
@@ -324,7 +324,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
 
   return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
 }
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index a34884480cea..40568d124de0 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -16,12 +16,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "xcore-selectiondag-info"
 
-XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL)
-    : TargetSelectionDAGInfo(&DL) {}
-
-XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
-}
-
 SDValue XCoreSelectionDAGInfo::
 EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                         SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -36,18 +30,20 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
     const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
     TargetLowering::ArgListTy Args;
     TargetLowering::ArgListEntry Entry;
-    Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
     Entry.Node = Dst; Args.push_back(Entry);
     Entry.Node = Src; Args.push_back(Entry);
     Entry.Node = Size; Args.push_back(Entry);
 
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(dl).setChain(Chain)
-      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
-                 Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
-                 std::move(Args), 0)
-      .setDiscardResult();
+    CLI.setDebugLoc(dl)
+        .setChain(Chain)
+        .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol("__memcpy_4",
+                                         TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args), 0)
+        .setDiscardResult();
 
     std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
     return CallResult.second;
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index cfd80b3f3172..77b3527d77e3 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -22,8 +22,6 @@ class XCoreTargetMachine;
 
 class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
-  explicit XCoreSelectionDAGInfo(const DataLayout &DL);
-  ~XCoreSelectionDAGInfo();
 
   SDValue
   EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index c98518b60225..99ad2c88504f 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -28,4 +28,4 @@ void XCoreSubtarget::anchor() { }
 XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS, const TargetMachine &TM)
     : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
-      TLInfo(TM, *this), TSInfo(*TM.getDataLayout()) {}
+      TLInfo(TM, *this), TSInfo() {}
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 370b64b26688..f420081868f9 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -85,6 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
 }
 
 TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis(
-      [this](Function &) { return TargetTransformInfo(XCoreTTIImpl(this)); });
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(XCoreTTIImpl(this, F));
+  });
 }
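TargetTransformInfo implementations are now constructed per function so that BasicTTIImplBase can be seeded with the module's DataLayout, which is why getTargetIRAnalysis threads the Function through the lambda. The resulting pattern, sketched with a hypothetical MyTargetMachine/MyTTIImpl pair:

  TargetIRAnalysis MyTargetMachine::getTargetIRAnalysis() {
    return TargetIRAnalysis([this](Function &F) {
      // F supplies the parent module, and with it the DataLayout.
      return TargetTransformInfo(MyTTIImpl(this, F));
    });
  }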
a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -37,8 +37,9 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
   const XCoreTargetLowering *getTLI() const { return TLI; }
 
 public:
-  explicit XCoreTTIImpl(const XCoreTargetMachine *TM)
-      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+  explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+        TLI(ST->getTargetLowering()) {}
 
   // Provide value semantics. MSVC requires that we spell all of these out.
   XCoreTTIImpl(const XCoreTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
   XCoreTTIImpl(XCoreTTIImpl &&Arg)
       : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
         TLI(std::move(Arg.TLI)) {}
-  XCoreTTIImpl &operator=(const XCoreTTIImpl &RHS) {
-    BaseT::operator=(static_cast<const BaseT &>(RHS));
-    ST = RHS.ST;
-    TLI = RHS.TLI;
-    return *this;
-  }
-  XCoreTTIImpl &operator=(XCoreTTIImpl &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    ST = std::move(RHS.ST);
-    TLI = std::move(RHS.TLI);
-    return *this;
-  }
 
   unsigned getNumberOfRegisters(bool Vector) {
     if (Vector) {