Diffstat (limited to 'lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r-- | lib/Target/PowerPC/PPCISelLowering.cpp | 1087
1 file changed, 823 insertions, 264 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 39608cb74bee..24d50074860d 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1,9 +1,8 @@ //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -45,6 +44,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -70,8 +70,10 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" @@ -111,6 +113,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); static cl::opt<bool> DisableSCO("disable-ppc-sco", cl::desc("disable sibling call optimization on ppc"), cl::Hidden); +static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32", +cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden); + static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision", cl::desc("enable quad precision float support on ppc"), cl::Hidden); @@ -119,6 +124,8 @@ STATISTIC(NumSiblingCalls, "Number of sibling calls"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); +static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl); + // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; @@ -550,7 +557,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); - setOperationAction(ISD::ABS, VT, Custom); + + // For v2i64, these are only valid with P8Vector. This is corrected after + // the loop. 
+ setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + + if (Subtarget.hasVSX()) { + setOperationAction(ISD::FMAXNUM, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + } // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { @@ -635,11 +653,28 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } + if (!Subtarget.hasP8Vector()) { + setOperationAction(ISD::SMAX, MVT::v2i64, Expand); + setOperationAction(ISD::SMIN, MVT::v2i64, Expand); + setOperationAction(ISD::UMAX, MVT::v2i64, Expand); + setOperationAction(ISD::UMIN, MVT::v2i64, Expand); + } + + for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}) + setOperationAction(ISD::ABS, VT, Custom); // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + // Vector truncates to sub-word integer that fit in an Altivec/VSX register + // are cheap, so handle them before they get expanded to scalar. + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); @@ -804,6 +839,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v2f64, Legal); setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal); if (Subtarget.hasDirectMove()) setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); @@ -866,6 +903,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FPOWI, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); } + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); } @@ -1060,6 +1098,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -1232,22 +1271,6 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } -unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return 2; - return PPCTargetLowering::getNumRegisters(Context, VT); -} - -MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return MVT::i32; - return PPCTargetLowering::getRegisterType(Context, VT); -} - bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } @@ -1256,6 +1279,10 @@ bool PPCTargetLowering::hasSPE() const { return Subtarget.hasSPE(); } +bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { + return VT.isScalarInteger(); +} + const char 
*PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -1365,7 +1392,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; + case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; + case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; + case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; + case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH"; } return nullptr; } @@ -2202,16 +2233,43 @@ bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } + +/// SelectAddressEVXRegReg - Given the specified address, check to see if it can +/// be represented as an indexed [r+r] operation. +bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base, + SDValue &Index, + SelectionDAG &DAG) const { + for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); + UI != E; ++UI) { + if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) { + if (Memop->getMemoryVT() == MVT::f64) { + Base = N.getOperand(0); + Index = N.getOperand(1); + return true; + } + } + } + return false; +} + /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it -/// can be more efficiently represented with [r+imm]. +/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is +/// non-zero and N can be represented by a base register plus a signed 16-bit +/// displacement, make a more precise judgement by checking (displacement % \p +/// EncodingAlignment). bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, - SDValue &Index, - SelectionDAG &DAG) const { + SDValue &Index, SelectionDAG &DAG, + unsigned EncodingAlignment) const { int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { - if (isIntS16Immediate(N.getOperand(1), imm)) - return false; // r+i + // Is there any SPE load/store (f64), which can't handle 16bit offset? + // SPE load/store can only handle 8-bit offsets. + if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG)) + return true; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!EncodingAlignment || !(imm % EncodingAlignment))) + return false; // r+i if (N.getOperand(1).getOpcode() == PPCISD::Lo) return false; // r+i @@ -2219,8 +2277,9 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, Index = N.getOperand(1); return true; } else if (N.getOpcode() == ISD::OR) { - if (isIntS16Immediate(N.getOperand(1), imm)) - return false; // r+i can fold it if we can. + if (isIntS16Immediate(N.getOperand(1), imm) && + (!EncodingAlignment || !(imm % EncodingAlignment))) + return false; // r+i can fold it if we can. // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably @@ -2284,22 +2343,22 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept /// displacements that are multiples of that value. 
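As an illustrative aside, the EncodingAlignment parameter threaded through SelectAddressRegReg and SelectAddressRegImm above boils down to one extra check: a displacement that fits a signed 16-bit immediate is still rejected for the [r+imm] form when it is not a multiple of the alignment the instruction encoding requires (for example, DS-form displacements must be multiples of 4), and the [r+r] form is used instead. A minimal stand-alone sketch of that check, with a hypothetical helper name, not taken from the patch:

// Illustrative sketch only, not part of the patch: the essence of the
// EncodingAlignment check used by the address-selection routines above.
#include <cstdint>

// Returns true if Displacement can be encoded in the displacement field of a
// reg+imm memory access whose encoding requires the displacement to be a
// multiple of EncodingAlignment (0 means no restriction).
static bool fitsRegImmForm(int64_t Displacement, unsigned EncodingAlignment) {
  bool FitsSigned16 = Displacement >= -32768 && Displacement <= 32767;
  if (!FitsSigned16)
    return false;
  return EncodingAlignment == 0 || (Displacement % EncodingAlignment) == 0;
}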
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - unsigned Alignment) const { + unsigned EncodingAlignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. - if (SelectAddressRegReg(N, Disp, Base, DAG)) + if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment)) return false; if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Alignment || (imm % Alignment) == 0)) { + (!EncodingAlignment || (imm % EncodingAlignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2323,7 +2382,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Alignment || (imm % Alignment) == 0)) { + (!EncodingAlignment || (imm % EncodingAlignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2349,7 +2408,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { + if (isIntS16Immediate(CN, Imm) && + (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2359,7 +2419,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { + (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2416,24 +2476,45 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, /// Returns true if we should use a direct load into vector instruction /// (such as lxsd or lfd), instead of a load into gpr + direct move sequence. -static bool usePartialVectorLoads(SDNode *N) { - if (!N->hasOneUse()) - return false; +static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) { // If there are any other uses other than scalar to vector, then we should // keep it as a scalar load -> direct move pattern to prevent multiple - // loads. Currently, only check for i64 since we have lxsd/lfd to do this - // efficiently, but no update equivalent. - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { - EVT MemVT = LD->getMemoryVT(); - if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) { - SDNode *User = *(LD->use_begin()); - if (User->getOpcode() == ISD::SCALAR_TO_VECTOR) - return true; - } + // loads. 
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N); + if (!LD) + return false; + + EVT MemVT = LD->getMemoryVT(); + if (!MemVT.isSimple()) + return false; + switch(MemVT.getSimpleVT().SimpleTy) { + case MVT::i64: + break; + case MVT::i32: + if (!ST.hasP8Vector()) + return false; + break; + case MVT::i16: + case MVT::i8: + if (!ST.hasP9Vector()) + return false; + break; + default: + return false; } - return false; + SDValue LoadedVal(N, 0); + if (!LoadedVal.hasOneUse()) + return false; + + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); + UI != UE; ++UI) + if (UI.getUse().get().getResNo() == 0 && + UI->getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + + return true; } /// getPreIndexedAddressParts - returns true by value, base pointer and @@ -2464,7 +2545,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // Do not generate pre-inc forms for specific loads that feed scalar_to_vector // instructions because we can fold these into a more efficient instruction // instead, (such as LXSD). - if (isLoad && usePartialVectorLoads(N)) { + if (isLoad && usePartialVectorLoads(N, Subtarget)) { return false; } @@ -2745,7 +2826,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); - TLSModel::Model Model = getTargetMachine().getTLSModel(GV); + const TargetMachine &TM = getTargetMachine(); + TLSModel::Model Model = TM.getTLSModel(GV); if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -2769,8 +2851,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); - } else - GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + } else { + if (!TM.isPositionIndependent()) + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + else if (picLevel == PICLevel::SmallPIC) + GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); + else + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); @@ -3147,101 +3235,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV, nextOffset)); } -#include "PPCGenCallingConv.inc" - -// Function whose sole purpose is to kill compiler warnings -// stemming from unused functions included from PPCGenCallingConv.inc. -CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { - return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; -} - -bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - return true; -} - -bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - - // Skip one register if the first unallocated register has an even register - // number and there are still argument registers available which have not been - // allocated yet. 
RegNum is actually an index into ArgRegs, which means we - // need to skip a register if RegNum is odd. - if (RegNum != NumArgRegs && RegNum % 2 == 1) { - State.AllocateReg(ArgRegs[RegNum]); - } - - // Always return false here, as this function only makes sure that the first - // unallocated register has an odd register number and does not actually - // allocate a register for the current argument. - return false; -} - -bool -llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - int RegsLeft = NumArgRegs - RegNum; - - // Skip if there is not enough registers left for long double type (4 gpr regs - // in soft float mode) and put long double argument on the stack. - if (RegNum != NumArgRegs && RegsLeft < 4) { - for (int i = 0; i < RegsLeft; i++) { - State.AllocateReg(ArgRegs[RegNum + i]); - } - } - - return false; -} - -bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const MCPhysReg ArgRegs[] = { - PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, - PPC::F8 - }; - - const unsigned NumArgRegs = array_lengthof(ArgRegs); - - unsigned RegNum = State.getFirstUnallocated(ArgRegs); - - // If there is only one Floating-point register left we need to put both f64 - // values of a split ppc_fp128 value on the stack. - if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { - State.AllocateReg(ArgRegs[RegNum]); - } - - // Always return false here, as this function only makes sure that the two f64 - // values a ppc_fp128 value is split into are both passed in registers or both - // passed on the stack and does not actually allocate a register for the - // current argument. - return false; -} - /// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, @@ -3449,7 +3442,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve space for the linkage area on the stack. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, PtrByteSize); - if (useSoftFloat() || hasSPE()) + if (useSoftFloat()) CCInfo.PreAnalyzeFormalArguments(Ins); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -3482,7 +3475,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else if (Subtarget.hasSPE()) - RC = &PPC::SPERCRegClass; + // SPE passes doubles in GPR pairs. + RC = &PPC::GPRCRegClass; else RC = &PPC::F8RCRegClass; break; @@ -3506,13 +3500,26 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( break; } - // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, - ValVT == MVT::i1 ? MVT::i32 : ValVT); - - if (ValVT == MVT::i1) - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + SDValue ArgValue; + // Transform the arguments stored in physical registers into + // virtual ones. 
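As an aside on the SPE path above: f64 arguments use the GPR register class because, without a classic FPU, each double is passed in a pair of 32-bit GPRs, and the code that follows rebuilds the value with PPCISD::BUILD_SPE64. A minimal host-side sketch of that reassembly, assuming the operand order implied by the lowering's Lo/Hi naming (the helper name is hypothetical):

// Illustrative sketch only: reassemble an f64 SPE argument from the two i32
// register halves, mirroring the endianness swap in the lowering above.
#include <cstdint>
#include <cstring>
#include <utility>

static double buildSPE64(uint32_t FirstReg, uint32_t SecondReg,
                         bool IsLittleEndian) {
  uint32_t Lo = FirstReg, Hi = SecondReg;
  if (!IsLittleEndian)
    std::swap(Lo, Hi); // on big-endian targets the first GPR holds the high word
  uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // bit-for-bit reinterpretation, like BUILD_SPE64
  return D;
}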
+ if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) { + assert(i + 1 < e && "No second half of double precision argument"); + unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC); + unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC); + SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32); + SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32); + if (!Subtarget.isLittleEndian()) + std::swap (ArgValueLo, ArgValueHi); + ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo, + ArgValueHi); + } else { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, + ValVT == MVT::i1 ? MVT::i32 : ValVT); + if (ValVT == MVT::i1) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + } InVals.push_back(ArgValue); } else { @@ -4448,24 +4455,27 @@ static bool isFunctionGlobalAddress(SDValue Callee); static bool callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { - // If !G, Callee can be an external symbol. - GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); - if (!G) - return false; - + // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols + // don't have enough information to determine if the caller and calle share + // the same TOC base, so we have to pessimistically assume they don't for + // correctness. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G) + return false; + + const GlobalValue *GV = G->getGlobal(); // The medium and large code models are expected to provide a sufficiently // large TOC to provide all data addressing needs of a module with a // single TOC. Since each module will be addressed with a single TOC then we // only need to check that caller and callee don't cross dso boundaries. if (CodeModel::Medium == TM.getCodeModel() || CodeModel::Large == TM.getCodeModel()) - return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); + return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV); // Otherwise we need to ensure callee and caller are in the same section, // since the linker may allocate multiple TOCs, and we don't know which // sections will belong to the same TOC base. - const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; @@ -4917,6 +4927,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isAIXABI = Subtarget.isAIXABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain @@ -4943,17 +4954,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. if (isFunctionGlobalAddress(Callee)) { GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); + // A call to a TLS address is actually an indirect call to a // thread-specific pointer. 
unsigned OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, - // every direct call is) turn it into a TargetGlobalAddress / - // TargetExternalSymbol node so that legalize doesn't hack it. Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, Callee.getValueType(), 0, OpFlags); needIndirectCall = false; @@ -5095,17 +5107,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live - // into the call. - // We do need to reserve X2 to appease the verifier for the PATCHPOINT. - if (isSVR4ABI && isPPC64) { + // All calls, in the AIX ABI and 64-bit ELF ABIs, need the TOC register + // live into the call. + // We do need to reserve R2/X2 to appease the verifier for the PATCHPOINT. + if ((isSVR4ABI && isPPC64) || isAIXABI) { setUsesTOCBasePtr(DAG); - // We cannot add X2 as an operand here for PATCHPOINT, because there is no - // way to mark dependencies as implicit here. We will add the X2 dependency - // in EmitInstrWithCustomInserter. - if (!isPatchPoint) - Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is + // no way to mark dependencies as implicit here. + // We will add the R2/X2 dependency in EmitInstrWithCustomInserter. + if (!isPatchPoint) + Ops.push_back(DAG.getRegister(isPPC64 ? PPC::X2 + : PPC::R2, PtrVT)); } return CallOpc; @@ -5129,10 +5142,27 @@ SDValue PPCTargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Val = DAG.getCopyFromReg(Chain, dl, - VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + SDValue Val; + + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + if (!Subtarget.isLittleEndian()) + std::swap (Lo, Hi); + Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi); + } else { + Val = DAG.getCopyFromReg(Chain, dl, + VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -5206,18 +5236,24 @@ SDValue PPCTargetLowering::FinishCall( } // Add a NOP immediately after the branch instruction when using the 64-bit - // SVR4 ABI. At link time, if caller and callee are in a different module and + // SVR4 or the AIX ABI. + // At link time, if caller and callee are in a different module and // thus have a different TOC, the call will be replaced with a call to a stub // function which saves the current TOC, loads the TOC of the callee and // branches to the callee. The NOP will be replaced with a load instruction // which restores the TOC of the caller from the TOC save slot of the current // stack frame. If caller and callee belong to the same module (and have the - // same TOC), the NOP will remain unchanged. + // same TOC), the NOP will remain unchanged, or become some other NOP. 
MachineFunction &MF = DAG.getMachineFunction(); - if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && - !isPatchPoint) { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + if (!isTailCall && !isPatchPoint && + ((Subtarget.isSVR4ABI() && Subtarget.isPPC64()) || + Subtarget.isAIXABI())) { if (CallOpc == PPCISD::BCTRL) { + if (Subtarget.isAIXABI()) + report_fatal_error("Indirect call on AIX is not implemented."); + // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function @@ -5229,7 +5265,6 @@ SDValue PPCTargetLowering::FinishCall( // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; - EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); @@ -5245,6 +5280,19 @@ SDValue PPCTargetLowering::FinishCall( } } + if (Subtarget.isAIXABI() && isFunctionGlobalAddress(Callee)) { + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by inserting a "." before the function's + // C-linkage name. + GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + MCSymbol *S = Context.getOrCreateSymbol(Twine(".") + + Twine(G->getGlobal()->getName())); + Callee = DAG.getMCSymbol(S, PtrVT); + // Replace the GlobalAddressSDNode Callee with the MCSymbolSDNode. + Ops[1] = Callee; + } + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); @@ -5314,16 +5362,20 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !isTailCall) Callee = LowerGlobalAddress(Callee, DAG); - if (Subtarget.isSVR4ABI()) { - if (Subtarget.isPPC64()) - return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); - else - return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); - } + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + + if (Subtarget.isSVR4ABI()) + return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); + + if (Subtarget.isAIXABI()) + return LowerCall_AIX(Chain, Callee, CallConv, isVarArg, + isTailCall, isPatchPoint, Outs, OutVals, Ins, + dl, DAG, InVals, CS); return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, @@ -5444,12 +5496,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. 
- for (unsigned i = 0, j = 0, e = ArgLocs.size(); + // i - Tracks the index into the list of registers allocated for the call + // RealArgIdx - Tracks the index into the list of actual function arguments + // j - Tracks the index into the list of byval arguments + for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size(); i != e; - ++i) { + ++i, ++RealArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[RealArgIdx]; + ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to @@ -5498,7 +5553,17 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( if (VA.isRegLoc()) { seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) { + bool IsLE = Subtarget.isLittleEndian(); + SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 0 : 1, dl)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0))); + SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 1 : 0, dl)); + RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(), + SVal.getValue(0))); + } else + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); @@ -6613,6 +6678,128 @@ SDValue PPCTargetLowering::LowerCall_Darwin( NumBytes, Ins, InVals, CS); } + +SDValue PPCTargetLowering::LowerCall_AIX( + SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, bool isPatchPoint, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, + ImmutableCallSite CS) const { + + assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) && + "Unimplemented calling convention!"); + if (isVarArg || isPatchPoint) + report_fatal_error("This call type is unimplemented on AIX."); + + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + bool isPPC64 = PtrVT == MVT::i64; + unsigned PtrByteSize = isPPC64 ? 8 : 4; + unsigned NumOps = Outs.size(); + + + // Count how many bytes are to be pushed on the stack, including the linkage + // area, parameter list area. + // On XCOFF, we start with 24/48, which is reserved space for + // [SP][CR][LR][2 x reserved][TOC]. + unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if the callee + // is variadic. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + unsigned NumBytes = LinkageSize + 8 * PtrByteSize; + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog + // inserter pass. + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + SDValue CallSeqStart = Chain; + + static const MCPhysReg GPR_32[] = { // 32-bit registers. 
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10 + }; + static const MCPhysReg GPR_64[] = { // 64-bit registers. + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10 + }; + + const unsigned NumGPRs = isPPC64 ? array_lengthof(GPR_64) + : array_lengthof(GPR_32); + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; + unsigned GPR_idx = 0; + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + + if (isTailCall) + report_fatal_error("Handling of tail call is unimplemented!"); + int SPDiff = 0; + + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + // Promote integers if needed. + if (Arg.getValueType() == MVT::i1 || + (isPPC64 && Arg.getValueType() == MVT::i32)) { + unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Arg = DAG.getNode(ExtOp, dl, PtrVT, Arg); + } + + // Note: "by value" is code for passing a structure by value, not + // basic types. + if (Flags.isByVal()) + report_fatal_error("Passing structure by value is unimplemented!"); + + switch (Arg.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (GPR_idx != NumGPRs) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + break; + case MVT::f32: + case MVT::f64: + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + case MVT::v1i128: + case MVT::f128: + case MVT::v4f64: + case MVT::v4i1: + report_fatal_error("Handling of this parameter type is unimplemented!"); + } + } + + if (!isFunctionGlobalAddress(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) + report_fatal_error("Handling of indirect call is unimplemented!"); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (auto Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag); + InFlag = Chain.getValue(1); + } + + return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, + RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, + NumBytes, Ins, InVals, CS); +} + bool PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, @@ -6644,11 +6831,11 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 4> RetOps(1, Chain); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[i]; + SDValue Arg = OutVals[RealResIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -6663,8 +6850,21 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; } - - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + bool isLittleEndian = Subtarget.isLittleEndian(); + // Legalize ret f64 -> ret 2 x i32. 
+ SDValue SVal = + DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl)); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl)); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + } else + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } @@ -6890,6 +7090,61 @@ SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(0)); } +SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, + SelectionDAG &DAG) const { + + // Implements a vector truncate that fits in a vector register as a shuffle. + // We want to legalize vector truncates down to where the source fits in + // a vector register (and target is therefore smaller than vector register + // size). At that point legalization will try to custom lower the sub-legal + // result and get here - where we can contain the truncate as a single target + // operation. + + // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows: + // <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2> + // + // We will implement it for big-endian ordering as this (where x denotes + // undefined): + // < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to + // < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u> + // + // The same operation in little-endian ordering will be: + // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to + // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1> + + assert(Op.getValueType().isVector() && "Vector type expected."); + + SDLoc DL(Op); + SDValue N1 = Op.getOperand(0); + unsigned SrcSize = N1.getValueType().getSizeInBits(); + assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector"); + SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL); + + EVT TrgVT = Op.getValueType(); + unsigned TrgNumElts = TrgVT.getVectorNumElements(); + EVT EltVT = TrgVT.getVectorElementType(); + unsigned WideNumElts = 128 / EltVT.getSizeInBits(); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); + + // First list the elements we want to keep. + unsigned SizeMult = SrcSize / TrgVT.getSizeInBits(); + SmallVector<int, 16> ShuffV; + if (Subtarget.isLittleEndian()) + for (unsigned i = 0; i < TrgNumElts; ++i) + ShuffV.push_back(i * SizeMult); + else + for (unsigned i = 1; i <= TrgNumElts; ++i) + ShuffV.push_back(i * SizeMult - 1); + + // Populate the remaining elements with undefs. + for (unsigned i = TrgNumElts; i < WideNumElts; ++i) + // ShuffV.push_back(i + WideNumElts); + ShuffV.push_back(WideNumElts + 1); + + SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc); + return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV); +} + /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. 
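As an aside on LowerTRUNCATEVector above: the shuffle mask it builds is easy to work through for a concrete case. For a little-endian v4i16 to v4i8 truncate, SizeMult is 2 and the kept lanes are bytes {0, 2, 4, 6}; the remaining lanes index the undef operand. A minimal sketch that mirrors the mask computation (the helper name is hypothetical):

// Illustrative sketch only: the mask computed by LowerTRUNCATEVector.
#include <vector>

static std::vector<int> truncateShuffleMask(unsigned TrgNumElts,
                                            unsigned WideNumElts,
                                            unsigned SizeMult,
                                            bool IsLittleEndian) {
  std::vector<int> ShuffV;
  if (IsLittleEndian)
    for (unsigned i = 0; i < TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult);       // low byte of each source element
  else
    for (unsigned i = 1; i <= TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult - 1);   // low byte sits at the high index
  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
    ShuffV.push_back(WideNumElts + 1);      // don't-care lanes from the undef vector
  return ShuffV;
}
// truncateShuffleMask(4, 16, 2, true) == {0, 2, 4, 6, 17, 17, ..., 17}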
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { @@ -9604,10 +9859,63 @@ SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { BifID = Intrinsic::ppc_altivec_vmaxsh; else if (VT == MVT::v16i8) BifID = Intrinsic::ppc_altivec_vmaxsb; - + return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT); } +// Custom lowering for fpext vf32 to v2f64 +SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + + assert(Op.getOpcode() == ISD::FP_EXTEND && + "Should only be called for ISD::FP_EXTEND"); + + // We only want to custom lower an extend from v2f32 to v2f64. + if (Op.getValueType() != MVT::v2f64 || + Op.getOperand(0).getValueType() != MVT::v2f32) + return SDValue(); + + SDLoc dl(Op); + SDValue Op0 = Op.getOperand(0); + + switch (Op0.getOpcode()) { + default: + return SDValue(); + case ISD::FADD: + case ISD::FMUL: + case ISD::FSUB: { + SDValue NewLoad[2]; + for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) { + // Ensure both input are loads. + SDValue LdOp = Op0.getOperand(i); + if (LdOp.getOpcode() != ISD::LOAD) + return SDValue(); + // Generate new load node. + LoadSDNode *LD = cast<LoadSDNode>(LdOp); + SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; + NewLoad[i] = + DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, + DAG.getVTList(MVT::v4f32, MVT::Other), + LoadOps, LD->getMemoryVT(), + LD->getMemOperand()); + } + SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, + NewLoad[0], NewLoad[1], + Op0.getNode()->getFlags()); + return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp); + } + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Op0); + SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; + SDValue NewLd = + DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, + DAG.getVTList(MVT::v4f32, MVT::Other), + LoadOps, LD->getMemoryVT(), LD->getMemOperand()); + return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd); + } + } + llvm_unreachable("ERROR:Should return for all cases within swtich."); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -9661,6 +9969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -9701,7 +10010,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_W_CHAIN: { if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != - Intrinsic::ppc_is_decremented_ctr_nonzero) + Intrinsic::loop_decrement) break; assert(N->getValueType(0) == MVT::i1 && @@ -9737,6 +10046,14 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; + case ISD::TRUNCATE: { + EVT TrgVT = N->getValueType(0); + if (TrgVT.isVector() && + isOperationCustom(N->getOpcode(), TrgVT) && + N->getOperand(0).getValueType().getSizeInBits() <= 128) + Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); + return; + } case ISD::BITCAST: // Don't handle bitcast here. 
return; @@ -9822,10 +10139,10 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned incr = MI.getOperand(3).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -9841,7 +10158,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned TmpReg = (!BinOpcode) ? incr : + Register TmpReg = (!BinOpcode) ? incr : RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); @@ -9949,20 +10266,20 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); - unsigned ShiftReg = + Register PtrReg = RegInfo.createVirtualRegister(RC); + Register Shift1Reg = RegInfo.createVirtualRegister(GPRC); + Register ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); - unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); - unsigned Ptr1Reg; - unsigned TmpReg = + Register Incr2Reg = RegInfo.createVirtualRegister(GPRC); + Register MaskReg = RegInfo.createVirtualRegister(GPRC); + Register Mask2Reg = RegInfo.createVirtualRegister(GPRC); + Register Mask3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + Register TmpDestReg = RegInfo.createVirtualRegister(GPRC); + Register Ptr1Reg; + Register TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC); // thisMBB: @@ -10764,23 +11081,23 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned PtrReg = RegInfo.createVirtualRegister(RC); - unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); - unsigned ShiftReg = + Register PtrReg = RegInfo.createVirtualRegister(RC); + Register Shift1Reg = RegInfo.createVirtualRegister(GPRC); + Register ShiftReg = isLittleEndian ? 
Shift1Reg : RegInfo.createVirtualRegister(GPRC); - unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); - unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); - unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); - unsigned Ptr1Reg; - unsigned TmpReg = RegInfo.createVirtualRegister(GPRC); - unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC); + Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC); + Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC); + Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC); + Register MaskReg = RegInfo.createVirtualRegister(GPRC); + Register Mask2Reg = RegInfo.createVirtualRegister(GPRC); + Register Mask3Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC); + Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC); + Register TmpDestReg = RegInfo.createVirtualRegister(GPRC); + Register Ptr1Reg; + Register TmpReg = RegInfo.createVirtualRegister(GPRC); + Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB @@ -10968,7 +11285,147 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); - return BB; + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) + .addReg(CRReg); + } else if (MI.getOpcode() == PPC::TBEGIN_RET) { + DebugLoc Dl = MI.getDebugLoc(); + unsigned Imm = MI.getOperand(1).getImm(); + BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm); + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), + MI.getOperand(0).getReg()) + .addReg(PPC::CR0EQ); + } else if (MI.getOpcode() == PPC::SETRNDi) { + DebugLoc dl = MI.getDebugLoc(); + unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); + + // The floating point rounding mode is in the bits 62:63 of FPCSR, and has + // the following settings: + // 00 Round to nearest + // 01 Round to 0 + // 10 Round to +inf + // 11 Round to -inf + + // When the operand is immediate, using the two least significant bits of + // the immediate to set the bits 62:63 of FPSCR. + unsigned Mode = MI.getOperand(1).getImm(); + BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0)) + .addImm(31); + + BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0)) + .addImm(30); + } else if (MI.getOpcode() == PPC::SETRND) { + DebugLoc dl = MI.getDebugLoc(); + + // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg + // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg. + // If the target doesn't have DirectMove, we should use stack to do the + // conversion, because the target doesn't have the instructions like mtvsrd + // or mfvsrd to do this conversion directly. 
+ auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) { + if (Subtarget.hasDirectMove()) { + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg) + .addReg(SrcReg); + } else { + // Use stack to do the register copy. + unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD; + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg); + if (RC == &PPC::F8RCRegClass) { + // Copy register from F8RCRegClass to G8RCRegclass. + assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) && + "Unsupported RegClass."); + + StoreOp = PPC::STFD; + LoadOp = PPC::LD; + } else { + // Copy register from G8RCRegClass to F8RCRegclass. + assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) && + (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) && + "Unsupported RegClass."); + } + + MachineFrameInfo &MFI = F->getFrameInfo(); + int FrameIdx = MFI.CreateStackObject(8, 8, false); + + MachineMemOperand *MMOStore = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + + // Store the SrcReg into the stack. + BuildMI(*BB, MI, dl, TII->get(StoreOp)) + .addReg(SrcReg) + .addImm(0) + .addFrameIndex(FrameIdx) + .addMemOperand(MMOStore); + + MachineMemOperand *MMOLoad = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + + // Load from the stack where SrcReg is stored, and save to DestReg, + // so we have done the RegClass conversion from RegClass::SrcReg to + // RegClass::DestReg. + BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg) + .addImm(0) + .addFrameIndex(FrameIdx) + .addMemOperand(MMOLoad); + } + }; + + unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); + + // When the operand is gprc register, use two least significant bits of the + // register and mtfsf instruction to set the bits 62:63 of FPSCR. + // + // copy OldFPSCRTmpReg, OldFPSCRReg + // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1) + // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62 + // copy NewFPSCRReg, NewFPSCRTmpReg + // mtfsf 255, NewFPSCRReg + MachineOperand SrcOp = MI.getOperand(1); + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + + copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg); + + unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + + // The first operand of INSERT_SUBREG should be a register which has + // subregisters, we only care about its RegClass, so we should use an + // IMPLICIT_DEF register. + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg); + BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg) + .addReg(ImDefReg) + .add(SrcOp) + .addImm(1); + + unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg) + .addReg(OldFPSCRTmpReg) + .addReg(ExtSrcReg) + .addImm(0) + .addImm(62); + + unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg); + + // The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63 + // bits of FPSCR. 
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)) + .addImm(255) + .addReg(NewFPSCRReg) + .addImm(0) + .addImm(0); } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -11006,7 +11463,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); - UseOneConstNR = true; + // The Newton-Raphson computation with a single constant does not provide + // enough accuracy on some CPUs. + UseOneConstNR = !Subtarget.needsTwoConstNR(); return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); @@ -12062,9 +12521,14 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { "Should be called with a BUILD_VECTOR node"); SDLoc dl(N); + + // Return early for non byte-sized type, as they can't be consecutive. + if (!N->getValueType(0).getVectorElementType().isByteSized()) + return SDValue(); + bool InputsAreConsecutiveLoads = true; bool InputsAreReverseConsecutive = true; - unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; + unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize(); SDValue FirstInput = N->getOperand(0); bool IsRoundOfExtLoad = false; @@ -12332,9 +12796,8 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1)); if (!Ext1Op || !Ext2Op) return SDValue(); - if (Ext1.getValueType() != MVT::i32 || - Ext2.getValueType() != MVT::i32) - if (Ext1.getOperand(0) != Ext2.getOperand(0)) + if (Ext1.getOperand(0).getValueType() != MVT::v4i32 || + Ext1.getOperand(0) != Ext2.getOperand(0)) return SDValue(); int FirstElem = Ext1Op->getZExtValue(); @@ -12664,6 +13127,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return combineSRA(N, DCI); case ISD::SRL: return combineSRL(N, DCI); + case ISD::MUL: + return combineMUL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); @@ -13246,7 +13711,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero) { + Intrinsic::loop_decrement) { // We now need to make the intrinsic dead (it cannot be instruction // selected). 
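Returning to the PPC::SETRND expansion above as an aside: its net effect on the 64-bit FPSCR image is to replace the two least significant bits (the RN rounding-control field, bits 62:63 in IBM numbering) with the two low bits of the requested mode, which is exactly what the RLDIMI with SH=0 and MB=62 computes. A minimal sketch (the helper name is hypothetical):

// Illustrative sketch only: the FPSCR update performed by the SETRND expansion.
#include <cstdint>

static uint64_t setRoundingField(uint64_t OldFPSCR, uint64_t RequestedMode) {
  // Keep bits 0..61 of the old FPSCR and insert the two low bits of the
  // requested mode into bits 62:63 (the RN field).
  return (OldFPSCR & ~uint64_t(3)) | (RequestedMode & 3);
}
// Modes: 0 = round to nearest, 1 = round toward zero,
//        2 = round toward +inf, 3 = round toward -inf.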
@@ -13272,14 +13737,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::AND && LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero && + Intrinsic::loop_decrement && isa<ConstantSDNode>(LHS.getOperand(1)) && !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == - Intrinsic::ppc_is_decremented_ctr_nonzero && + Intrinsic::loop_decrement && isa<ConstantSDNode>(RHS)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Counter decrement comparison is not EQ or NE"); @@ -13355,9 +13820,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case ISD::ABS: + case ISD::ABS: return combineABS(N, DCI); - case ISD::VSELECT: + case ISD::VSELECT: return combineVSelect(N, DCI); } @@ -13453,6 +13918,15 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { if (!ML) break; + if (!DisableInnermostLoopAlign32) { + // If the nested loop is an innermost loop, prefer to a 32-byte alignment, + // so that we can decrease cache misses and branch-prediction misses. + // Actual alignment of the loop will depend on the hotness check and other + // logic in alignBlocks. + if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty()) + return 5; + } + const PPCInstrInfo *TII = Subtarget.getInstrInfo(); // For small loops (between 5 and 8 instructions), align to a 32-byte @@ -13502,7 +13976,7 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || Constraint == "wf" || Constraint == "ws" || - Constraint == "wi") { + Constraint == "wi" || Constraint == "ww") { return C_RegisterClass; // VSX registers. } return TargetLowering::getConstraintType(Constraint); @@ -13530,10 +14004,12 @@ PPCTargetLowering::getSingleConstraintMatchWeight( StringRef(constraint) == "wf") && type->isVectorTy()) return CW_Register; - else if (StringRef(constraint) == "ws" && type->isDoubleTy()) - return CW_Register; else if (StringRef(constraint) == "wi" && type->isIntegerTy(64)) return CW_Register; // just hold 64-bit integers data. + else if (StringRef(constraint) == "ws" && type->isDoubleTy()) + return CW_Register; + else if (StringRef(constraint) == "ww" && type->isFloatTy()) + return CW_Register; switch (*constraint) { default: @@ -13619,7 +14095,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Constraint == "wf" || Constraint == "wi") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); - } else if (Constraint == "ws" && Subtarget.hasVSX()) { + } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else @@ -13865,7 +14341,7 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { if (CModel == CodeModel::Small || CModel == CodeModel::Large) return true; - // JumpTable and BlockAddress are accessed as got-indirect. + // JumpTable and BlockAddress are accessed as got-indirect. if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA)) return true; @@ -14082,18 +14558,16 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// source is constant so it does not need to be loaded. 
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
-EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
-                                           unsigned DstAlign, unsigned SrcAlign,
-                                           bool IsMemset, bool ZeroMemset,
-                                           bool MemcpyStrSrc,
-                                           MachineFunction &MF) const {
+EVT PPCTargetLowering::getOptimalMemOpType(
+    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+    bool ZeroMemset, bool MemcpyStrSrc,
+    const AttributeList &FuncAttributes) const {
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
-    const Function &F = MF.getFunction();
     // When expanding a memset, require at least two QPX instructions to cover
     // the cost of loading the value to be stored from the constant pool.
     if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
         (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
-        !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+        !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
       return MVT::v4f64;
     }
@@ -14178,6 +14652,7 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
 bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                        unsigned,
                                                        unsigned,
+                                                       MachineMemOperand::Flags,
                                                        bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
@@ -14324,7 +14799,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
       BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
           .addReg(*I);
 
-    // Insert the copy-back instructions right before the terminator
+    // Insert the copy-back instructions right before the terminator.
     for (auto *Exit : Exits)
       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
               TII->get(TargetOpcode::COPY), *I)
@@ -14345,7 +14820,8 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
   return TargetLowering::insertSSPDeclarations(M);
 }
 
-bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+                                     bool ForCodeSize) const {
   if (!VT.isSimple() || !Subtarget.hasVSX())
     return false;
 
@@ -14585,6 +15061,89 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
   return SDValue();
 }
 
+SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
+  if (!ConstOpOrElement)
+    return SDValue();
+
+  // An imul is usually smaller than the alternative sequence for a legal type.
+  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+      isOperationLegal(ISD::MUL, N->getValueType(0)))
+    return SDValue();
+
+  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
+    switch (this->Subtarget.getDarwinDirective()) {
+    default:
+      // TODO: enhance the condition for subtargets before pwr8.
+      return false;
+    case PPC::DIR_PWR8:
+      //  type        mul     add    shl
+      // scalar        4       1      1
+      // vector        7       2      2
+      return true;
+    case PPC::DIR_PWR9:
+      //  type        mul     add    shl
+      // scalar        5       2      2
+      // vector        7       2      2
+
+      // The cycle costs of the related operations are shown in the table
+      // above: mul is 5 (scalar) / 7 (vector) while add/sub/shl are all 2.
+      // Two-instruction patterns (add/sub + shl, 4 cycles) are therefore
+      // always profitable, but the three-instruction pattern
+      // (mul x, -(2^N + 1)) => -(add (shl x, N), x) costs 6 cycles, which
+      // only beats the 7-cycle vector mul, so restrict it to vector types.
+      return IsAddOne && IsNeg ? VT.isVector() : true;
+    }
+  };
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
+  bool IsNeg = MulAmt.isNegative();
+  APInt MulAmtAbs = MulAmt.abs();
+
+  if ((MulAmtAbs - 1).isPowerOf2()) {
+    // (mul x, 2^N + 1) => (add (shl x, N), x)
+    // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
+
+    if (!IsProfitable(IsNeg, true, VT))
+      return SDValue();
+
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 =
+        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                    DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
+    SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+
+    if (!IsNeg)
+      return Res;
+
+    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+  } else if ((MulAmtAbs + 1).isPowerOf2()) {
+    // (mul x, 2^N - 1) => (sub (shl x, N), x)
+    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+
+    if (!IsProfitable(IsNeg, false, VT))
+      return SDValue();
+
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 =
+        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                    DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
+
+    if (!IsNeg)
+      return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
+    else
+      return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+
+  } else {
+    return SDValue();
+  }
+}
+
 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
   if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
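
The combineMUL addition above relies on the shift/add identities quoted in its comments. A standalone sanity check of those identities with N = 2 and N = 3 (plain C++, not tied to the SelectionDAG API):

#include <cassert>
#include <cstdint>

int main() {
  int64_t X = 12345;
  assert(X * 5  == (X << 2) + X);        // mul x, 2^N + 1    => (add (shl x, N), x)
  assert(X * -5 == 0 - ((X << 2) + X));  // mul x, -(2^N + 1) => -(add (shl x, N), x)
  assert(X * 7  == (X << 3) - X);        // mul x, 2^N - 1    => (sub (shl x, N), x)
  assert(X * -7 == X - (X << 3));        // mul x, -(2^N - 1) => (sub x, (shl x, N))
  return 0;
}

The IsProfitable lambda then limits the three-instruction negated add-one form to vector types, matching the cycle table in its comments.
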