Diffstat (limited to 'lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 1087
1 file changed, 823 insertions(+), 264 deletions(-)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 39608cb74bee..24d50074860d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -45,6 +44,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -70,8 +70,10 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
@@ -111,6 +113,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
+cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
+
static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
cl::desc("enable quad precision float support on ppc"), cl::Hidden);
@@ -119,6 +124,8 @@ STATISTIC(NumSiblingCalls, "Number of sibling calls");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -550,7 +557,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
- setOperationAction(ISD::ABS, VT, Custom);
+
+ // For v2i64, these are only valid with P8Vector. This is corrected after
+ // the loop.
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+
+ if (Subtarget.hasVSX()) {
+ setOperationAction(ISD::FMAXNUM, VT, Legal);
+ setOperationAction(ISD::FMINNUM, VT, Legal);
+ }
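
As an illustration of what these actions enable (a sketch, not part of the patch): once ISD::SMAX is Legal for v4i32, the classic compare-and-select idiom below can be matched to a single vmaxsw instead of a compare plus select.

    typedef int v4i32 __attribute__((vector_size(16)));

    /* Illustrative GNU C: lane-wise signed maximum. The comparison yields
       all-ones/all-zero lanes; the masked select is the pattern DAG
       combine recognizes as ISD::SMAX. */
    v4i32 smax_demo(v4i32 a, v4i32 b) {
      v4i32 m = a > b;            /* -1 where a > b, 0 elsewhere */
      return (a & m) | (b & ~m);  /* pick a or b per lane */
    }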
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
@@ -635,11 +653,28 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
+ if (!Subtarget.hasP8Vector()) {
+ setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
+ }
+
+ for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
+ setOperationAction(ISD::ABS, VT, Custom);
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+ // Vector truncates to sub-word integers that fit in an Altivec/VSX register
+ // are cheap, so handle them before they get expanded to scalar.
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+
setOperationAction(ISD::AND , MVT::v4i32, Legal);
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
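
For context (an illustrative sketch, not from the patch): a source-level sub-word vector truncate such as the one below produces exactly the kind of sub-register-sized result these Custom actions intercept, so it can be lowered as one shuffle rather than eight scalar truncations.

    typedef short v8i16 __attribute__((vector_size(16)));
    typedef signed char v8i8 __attribute__((vector_size(8)));

    /* Illustrative: <8 x i16> -> <8 x i8>, a sub-register-sized result. */
    v8i8 trunc_demo(v8i16 v) {
      return __builtin_convertvector(v, v8i8);
    }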
@@ -804,6 +839,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
if (Subtarget.hasDirectMove())
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
@@ -866,6 +903,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
}
@@ -1060,6 +1098,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -1232,22 +1271,6 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
return Align;
}
-unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
- EVT VT) const {
- if (Subtarget.hasSPE() && VT == MVT::f64)
- return 2;
- return PPCTargetLowering::getNumRegisters(Context, VT);
-}
-
-MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
- EVT VT) const {
- if (Subtarget.hasSPE() && VT == MVT::f64)
- return MVT::i32;
- return PPCTargetLowering::getRegisterType(Context, VT);
-}
-
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
@@ -1256,6 +1279,10 @@ bool PPCTargetLowering::hasSPE() const {
return Subtarget.hasSPE();
}
+bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ return VT.isScalarInteger();
+}
+
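
The algebra behind this hook, as a one-line sanity check (illustrative only): in two's complement, ~x == -x - 1, so y - ~x == (x + y) + 1, and an "inc of add" can stand in for a "sub of not" on the scalar integer types the hook opts in.

    /* Illustrative: both functions compute the same value. */
    int inc_of_add(int x, int y) { return (x + y) + 1; }
    int sub_of_not(int x, int y) { return y - ~x; }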
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1365,7 +1392,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
+ case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
+ case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
+ case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
+ case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH";
}
return nullptr;
}
@@ -2202,16 +2233,43 @@ bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
+
+/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
+/// be represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI) {
+ if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
+ if (Memop->getMemoryVT() == MVT::f64) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
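
An illustrative trigger for this hook (a sketch; the exact encoding limit is an assumption here): SPE's extended f64 loads/stores encode only a small scaled displacement, so a double access at a large constant offset is better selected as base + index registers.

    /* Illustrative, for a powerpcspe target: the byte offset 320 is too
       large for evldd's short displacement, so [r+r] addressing wins. */
    double load_demo(double *p) { return p[40]; }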
/// SelectAddressRegReg - Given the specified addressed, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
-/// can be more efficiently represented with [r+imm].
+/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
+/// non-zero and N can be represented by a base register plus a signed 16-bit
+/// displacement, make a more precise judgement by checking (displacement % \p
+/// EncodingAlignment).
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
- SDValue &Index,
- SelectionDAG &DAG) const {
+ SDValue &Index, SelectionDAG &DAG,
+ unsigned EncodingAlignment) const {
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD) {
- if (isIntS16Immediate(N.getOperand(1), imm))
- return false; // r+i
+ // SPE f64 loads/stores can only encode a small 8-bit offset, so they
+ // cannot use a 16-bit [r+imm] form; prefer [r+r] for SPE memops here.
+ if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
+ return true;
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!EncodingAlignment || !(imm % EncodingAlignment)))
+ return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
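
A sketch of why EncodingAlignment matters (illustrative; instruction names are assumptions): 64-bit ld/std use the DS instruction form, whose displacement must be a multiple of 4. With EncodingAlignment = 4, an offset such as 6 fails the modulo test above, so the address is formed as [r+r] (ldx) instead of an unencodable [r+imm].

    /* Illustrative: an 8-byte load at byte offset 6. "ld r, 6(base)" is
       not encodable (6 % 4 != 0), so an indexed load is used instead. */
    long load_off6(char *p) {
      long v;
      __builtin_memcpy(&v, p + 6, sizeof v);
      return v;
    }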
@@ -2219,8 +2277,9 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
- if (isIntS16Immediate(N.getOperand(1), imm))
- return false; // r+i can fold it if we can.
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!EncodingAlignment || !(imm % EncodingAlignment)))
+ return false; // r+i: fold the immediate if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are provably
@@ -2284,22 +2343,22 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg. If \p Alignment is non-zero, only accept
+/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
SDValue &Base,
SelectionDAG &DAG,
- unsigned Alignment) const {
+ unsigned EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If this can be more profitably realized as r+r, fail.
- if (SelectAddressRegReg(N, Disp, Base, DAG))
+ if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Alignment || (imm % Alignment) == 0)) {
+ (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -2323,7 +2382,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Alignment || (imm % Alignment) == 0)) {
+ (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
@@ -2349,7 +2408,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
int16_t Imm;
- if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) {
+ if (isIntS16Immediate(CN, Imm) &&
+ (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -2359,7 +2419,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
- (!Alignment || (CN->getZExtValue() % Alignment) == 0)) {
+ (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
@@ -2416,24 +2476,45 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
-static bool usePartialVectorLoads(SDNode *N) {
- if (!N->hasOneUse())
- return false;
+static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget &ST) {
// If there are any other uses other than scalar to vector, then we should
// keep it as a scalar load -> direct move pattern to prevent multiple
- // loads. Currently, only check for i64 since we have lxsd/lfd to do this
- // efficiently, but no update equivalent.
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- EVT MemVT = LD->getMemoryVT();
- if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) {
- SDNode *User = *(LD->use_begin());
- if (User->getOpcode() == ISD::SCALAR_TO_VECTOR)
- return true;
- }
+ // loads.
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD)
+ return false;
+
+ EVT MemVT = LD->getMemoryVT();
+ if (!MemVT.isSimple())
+ return false;
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ case MVT::i64:
+ break;
+ case MVT::i32:
+ if (!ST.hasP8Vector())
+ return false;
+ break;
+ case MVT::i16:
+ case MVT::i8:
+ if (!ST.hasP9Vector())
+ return false;
+ break;
+ default:
+ return false;
}
- return false;
+ SDValue LoadedVal(N, 0);
+ if (!LoadedVal.hasOneUse())
+ return false;
+
+ for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().get().getResNo() == 0 &&
+ UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ return true;
}
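
A hedged illustration of the pattern this function now accepts for more types (the exact instruction choice is an assumption): a load whose only user is the insertion into a vector can become a single load-to-vector-register instruction instead of a GPR load followed by a direct move.

    #include <altivec.h>

    /* Illustrative: the i64 load feeds only the vector build, so it may
       be selected as lxsd rather than ld + mtvsrd. */
    vector long long from_scalar(long long *p) {
      return (vector long long){*p, 0};
    }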
/// getPreIndexedAddressParts - returns true by value, base pointer and
@@ -2464,7 +2545,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
// instructions because we can fold these into a more efficient instruction
// instead, (such as LXSD).
- if (isLoad && usePartialVectorLoads(N)) {
+ if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
}
@@ -2745,7 +2826,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
- TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+ const TargetMachine &TM = getTargetMachine();
+ TLSModel::Model Model = TM.getTLSModel(GV);
if (Model == TLSModel::LocalExec) {
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
@@ -2769,8 +2851,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
PtrVT, GOTReg, TGA);
- } else
- GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ } else {
+ if (!TM.isPositionIndependent())
+ GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ else if (picLevel == PICLevel::SmallPIC)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
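
An illustrative way to reach the fixed path (a sketch, not from the patch): a 32-bit ELF build with -fPIC using initial-exec TLS previously selected the non-PIC PPC32_GOT node; with the change above it materializes the appropriate PIC GOT pointer instead.

    /* Illustrative: compile with -m32 -fPIC for a PPC ELF target. */
    extern __thread int counter __attribute__((tls_model("initial-exec")));
    int read_counter(void) { return counter; }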
@@ -3147,101 +3235,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV, nextOffset));
}
-#include "PPCGenCallingConv.inc"
-
-// Function whose sole purpose is to kill compiler warnings
-// stemming from unused functions included from PPCGenCallingConv.inc.
-CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
- return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- return true;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
-
- // Skip one register if the first unallocated register has an even register
- // number and there are still argument registers available which have not been
- // allocated yet. RegNum is actually an index into ArgRegs, which means we
- // need to skip a register if RegNum is odd.
- if (RegNum != NumArgRegs && RegNum % 2 == 1) {
- State.AllocateReg(ArgRegs[RegNum]);
- }
-
- // Always return false here, as this function only makes sure that the first
- // unallocated register has an odd register number and does not actually
- // allocate a register for the current argument.
- return false;
-}
-
-bool
-llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
- int RegsLeft = NumArgRegs - RegNum;
-
- // Skip if there are not enough registers left for long double type (4 gpr regs
- // in soft float mode) and put long double argument on the stack.
- if (RegNum != NumArgRegs && RegsLeft < 4) {
- for (int i = 0; i < RegsLeft; i++) {
- State.AllocateReg(ArgRegs[RegNum + i]);
- }
- }
-
- return false;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
- PPC::F8
- };
-
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
-
- // If there is only one Floating-point register left we need to put both f64
- // values of a split ppc_fp128 value on the stack.
- if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
- State.AllocateReg(ArgRegs[RegNum]);
- }
-
- // Always return false here, as this function only makes sure that the two f64
- // values a ppc_fp128 value is split into are both passed in registers or both
- // passed on the stack and does not actually allocate a register for the
- // current argument.
- return false;
-}
-
/// FPR - The set of FP registers that should be allocated for arguments,
/// on Darwin.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
@@ -3449,7 +3442,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
- if (useSoftFloat() || hasSPE())
+ if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -3482,7 +3475,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else if (Subtarget.hasSPE())
- RC = &PPC::SPERCRegClass;
+ // SPE passes doubles in GPR pairs.
+ RC = &PPC::GPRCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
@@ -3506,13 +3500,26 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
break;
}
- // Transform the arguments stored in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
- ValVT == MVT::i1 ? MVT::i32 : ValVT);
-
- if (ValVT == MVT::i1)
- ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+ SDValue ArgValue;
+ // Transform the arguments stored in physical registers into
+ // virtual ones.
+ if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
+ assert(i + 1 < e && "No second half of double precision argument");
+ unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
+ unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
+ SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
+ SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
+ if (!Subtarget.isLittleEndian())
+ std::swap(ArgValueLo, ArgValueHi);
+ ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
+ ArgValueHi);
+ } else {
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+ ValVT == MVT::i1 ? MVT::i32 : ValVT);
+ if (ValVT == MVT::i1)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+ }
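
For context (illustrative): on an SPE target there are no separate FP argument registers, so even this trivial function receives its f64 argument split across a pair of 32-bit GPRs, which the BUILD_SPE64 node above glues back together.

    /* Illustrative, for powerpcspe: d arrives in a GPR pair (e.g. r3:r4),
       is recombined on entry, and split again for the return. */
    double identity(double d) { return d; }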
InVals.push_back(ArgValue);
} else {
@@ -4448,24 +4455,27 @@ static bool isFunctionGlobalAddress(SDValue Callee);
static bool
callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
- // If !G, Callee can be an external symbol.
- GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (!G)
- return false;
-
+ // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
+ // don't have enough information to determine if the caller and callee share
+ // the same TOC base, so we have to pessimistically assume they don't for
+ // correctness.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G)
+ return false;
+
+ const GlobalValue *GV = G->getGlobal();
// The medium and large code models are expected to provide a sufficiently
// large TOC to satisfy all data addressing needs of a module with a
// single TOC. Since each module will be addressed with a single TOC then we
// only need to check that caller and callee don't cross dso boundaries.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
- return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());
+ return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
- const GlobalValue *GV = G->getGlobal();
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4917,6 +4927,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool isAIXABI = Subtarget.isAIXABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
NodeTys.push_back(MVT::Other); // Returns a chain
@@ -4943,17 +4954,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+ // every direct call is) turn it into a TargetGlobalAddress /
+ // TargetExternalSymbol node so that legalize doesn't hack it.
if (isFunctionGlobalAddress(Callee)) {
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+
// A call to a TLS address is actually an indirect call to a
// thread-specific pointer.
unsigned OpFlags = 0;
if (UsePlt)
OpFlags = PPCII::MO_PLT;
- // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
- // every direct call is) turn it into a TargetGlobalAddress /
- // TargetExternalSymbol node so that legalize doesn't hack it.
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
Callee.getValueType(), 0, OpFlags);
needIndirectCall = false;
@@ -5095,17 +5107,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
- // into the call.
- // We do need to reserve X2 to appease the verifier for the PATCHPOINT.
- if (isSVR4ABI && isPPC64) {
+ // All calls, in the AIX ABI and 64-bit ELF ABIs, need the TOC register
+ // live into the call.
+ // We do need to reserve R2/X2 to appease the verifier for the PATCHPOINT.
+ if ((isSVR4ABI && isPPC64) || isAIXABI) {
setUsesTOCBasePtr(DAG);
- // We cannot add X2 as an operand here for PATCHPOINT, because there is no
- // way to mark dependencies as implicit here. We will add the X2 dependency
- // in EmitInstrWithCustomInserter.
- if (!isPatchPoint)
- Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+ // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
+ // no way to mark dependencies as implicit here.
+ // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
+ if (!isPatchPoint)
+ Ops.push_back(DAG.getRegister(isPPC64 ? PPC::X2
+ : PPC::R2, PtrVT));
}
return CallOpc;
@@ -5129,10 +5142,27 @@ SDValue PPCTargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue Val = DAG.getCopyFromReg(Chain, dl,
- VA.getLocReg(), VA.getLocVT(), InFlag);
- Chain = Val.getValue(1);
- InFlag = Val.getValue(2);
+ SDValue Val;
+
+ if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
+ SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ if (!Subtarget.isLittleEndian())
+ std::swap(Lo, Hi);
+ Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
+ } else {
+ Val = DAG.getCopyFromReg(Chain, dl,
+ VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
@@ -5206,18 +5236,24 @@ SDValue PPCTargetLowering::FinishCall(
}
// Add a NOP immediately after the branch instruction when using the 64-bit
- // SVR4 ABI. At link time, if caller and callee are in a different module and
+ // SVR4 or the AIX ABI.
+ // At link time, if caller and callee are in a different module and
// thus have a different TOC, the call will be replaced with a call to a stub
// function which saves the current TOC, loads the TOC of the callee and
// branches to the callee. The NOP will be replaced with a load instruction
// which restores the TOC of the caller from the TOC save slot of the current
// stack frame. If caller and callee belong to the same module (and have the
- // same TOC), the NOP will remain unchanged.
+ // same TOC), the NOP will remain unchanged, or become some other NOP.
MachineFunction &MF = DAG.getMachineFunction();
- if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
- !isPatchPoint) {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ if (!isTailCall && !isPatchPoint &&
+ ((Subtarget.isSVR4ABI() && Subtarget.isPPC64()) ||
+ Subtarget.isAIXABI())) {
if (CallOpc == PPCISD::BCTRL) {
+ if (Subtarget.isAIXABI())
+ report_fatal_error("Indirect call on AIX is not implemented.");
+
// This is a call through a function pointer.
// Restore the caller TOC from the save area into R2.
// See PrepareCall() for more information about calls through function
@@ -5229,7 +5265,6 @@ SDValue PPCTargetLowering::FinishCall(
// allocated and an unnecessary move instruction being generated.
CallOpc = PPCISD::BCTRL_LOAD_TOC;
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
@@ -5245,6 +5280,19 @@ SDValue PPCTargetLowering::FinishCall(
}
}
+ if (Subtarget.isAIXABI() && isFunctionGlobalAddress(Callee)) {
+ // On AIX, direct function calls reference the symbol for the function's
+ // entry point, which is named by inserting a "." before the function's
+ // C-linkage name.
+ GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+ auto &Context = DAG.getMachineFunction().getMMI().getContext();
+ MCSymbol *S = Context.getOrCreateSymbol(Twine(".") +
+ Twine(G->getGlobal()->getName()));
+ Callee = DAG.getMCSymbol(S, PtrVT);
+ // Replace the GlobalAddressSDNode Callee with the MCSymbolSDNode.
+ Ops[1] = Callee;
+ }
+
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
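
An illustrative consequence (the assembly shown is an assumption of the expected form): a direct call on AIX is emitted against the dot-prefixed entry-point symbol created above.

    /* Illustrative: on AIX this call is expected to assemble roughly as
       "bl .foo" rather than "bl foo". */
    extern int foo(void);
    int call_foo(void) { return foo(); }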
@@ -5314,16 +5362,20 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
- if (Subtarget.isSVR4ABI()) {
- if (Subtarget.isPPC64())
- return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
- else
- return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
- }
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+ return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+
+ if (Subtarget.isSVR4ABI())
+ return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+
+ if (Subtarget.isAIXABI())
+ return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
@@ -5444,12 +5496,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, j = 0, e = ArgLocs.size();
+ // i - Tracks the index into the list of registers allocated for the call
+ // RealArgIdx - Tracks the index into the list of actual function arguments
+ // j - Tracks the index into the list of byval arguments
+ for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
i != e;
- ++i) {
+ ++i, ++RealArgIdx) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ SDValue Arg = OutVals[RealArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
@@ -5498,7 +5553,17 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
if (VA.isRegLoc()) {
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
+ bool IsLE = Subtarget.isLittleEndian();
+ SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
+ SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
+ RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
+ SVal.getValue(0)));
+ } else
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
// Put argument in the parameter list area of the current stack frame.
assert(VA.isMemLoc());
@@ -6613,6 +6678,128 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
NumBytes, Ins, InVals, CS);
}
+
+SDValue PPCTargetLowering::LowerCall_AIX(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite CS) const {
+
+ assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) &&
+ "Unimplemented calling convention!");
+ if (isVarArg || isPatchPoint)
+ report_fatal_error("This call type is unimplemented on AIX.");
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ bool isPPC64 = PtrVT == MVT::i64;
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+ unsigned NumOps = Outs.size();
+
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area and the parameter list area.
+ // On XCOFF, we start with 24/48, which is reserved space for
+ // [SP][CR][LR][2 x reserved][TOC].
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if the callee
+ // is variadic.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ unsigned NumBytes = LinkageSize + 8 * PtrByteSize;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog
+ // inserter pass.
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ SDValue CallSeqStart = Chain;
+
+ static const MCPhysReg GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10
+ };
+ static const MCPhysReg GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10
+ };
+
+ const unsigned NumGPRs = isPPC64 ? array_lengthof(GPR_64)
+ : array_lengthof(GPR_32);
+ const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+ unsigned GPR_idx = 0;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ if (isTailCall)
+ report_fatal_error("Handling of tail call is unimplemented!");
+ int SPDiff = 0;
+
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ // Promote integers if needed.
+ if (Arg.getValueType() == MVT::i1 ||
+ (isPPC64 && Arg.getValueType() == MVT::i32)) {
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, PtrVT, Arg);
+ }
+
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
+ if (Flags.isByVal())
+ report_fatal_error("Passing structure by value is unimplemented!");
+
+ switch (Arg.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (GPR_idx != NumGPRs)
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ else
+ report_fatal_error("Handling of placing parameters on the stack is "
+ "unimplemented!");
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ case MVT::f128:
+ case MVT::v4f64:
+ case MVT::v4i1:
+ report_fatal_error("Handling of this parameter type is unimplemented!");
+ }
+ }
+
+ if (!isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee))
+ report_fatal_error("Handling of indirect call is unimplemented!");
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
+ RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+ NumBytes, Ins, InVals, CS);
+}
+
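
For concreteness, the stack-size arithmetic above works out as follows (derived from the code, not stated in the patch): on a 32-bit target the linkage area is 24 bytes, so NumBytes = 24 + 8 * 4 = 56; on a 64-bit target it is 48 bytes, so NumBytes = 48 + 8 * 8 = 112. In both cases that is the linkage area plus a conservative home area for all 8 GPR argument registers.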
bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
@@ -6644,11 +6831,11 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue Arg = OutVals[i];
+ SDValue Arg = OutVals[RealResIdx];
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
@@ -6663,8 +6850,21 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
-
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+ if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ // Legalize ret f64 -> ret 2 x i32.
+ SDValue SVal =
+ DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
+ Flag = Chain.getValue(1);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
@@ -6890,6 +7090,61 @@ SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
Op.getOperand(0));
}
+SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ // Implements a vector truncate that fits in a vector register as a shuffle.
+ // We want to legalize vector truncates down to where the source fits in
+ // a vector register (and target is therefore smaller than vector register
+ // size). At that point legalization will try to custom lower the sub-legal
+ // result and get here - where we can contain the truncate as a single target
+ // operation.
+
+ // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
+ // <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
+ //
+ // We will implement it for big-endian ordering as this (where x denotes
+ // undefined):
+ // < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
+ // < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+ //
+ // The same operation in little-endian ordering will be:
+ // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
+ // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
+
+ assert(Op.getValueType().isVector() && "Vector type expected.");
+
+ SDLoc DL(Op);
+ SDValue N1 = Op.getOperand(0);
+ unsigned SrcSize = N1.getValueType().getSizeInBits();
+ assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
+ SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+
+ EVT TrgVT = Op.getValueType();
+ unsigned TrgNumElts = TrgVT.getVectorNumElements();
+ EVT EltVT = TrgVT.getVectorElementType();
+ unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+ // First list the elements we want to keep.
+ unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
+ SmallVector<int, 16> ShuffV;
+ if (Subtarget.isLittleEndian())
+ for (unsigned i = 0; i < TrgNumElts; ++i)
+ ShuffV.push_back(i * SizeMult);
+ else
+ for (unsigned i = 1; i <= TrgNumElts; ++i)
+ ShuffV.push_back(i * SizeMult - 1);
+
+ // Populate the remaining elements with undefs; any index past the last
+ // lane of the two concatenated shuffle operands reads as undef.
+ for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
+ ShuffV.push_back(WideNumElts + 1);
+
+ SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
+ return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+}
+
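
A worked instance of the mask construction (illustrative): truncating <4 x i32> to <4 x i16> gives SrcSize = 128, SizeMult = 2 and WideNumElts = 8. Little-endian keeps lanes {0, 2, 4, 6} (the low halfwords); big-endian keeps {1, 3, 5, 7}. The remaining four mask slots get an index past the first operand's lanes, so they read from the UNDEF second operand.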
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -9604,10 +9859,63 @@ SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
BifID = Intrinsic::ppc_altivec_vmaxsh;
else if (VT == MVT::v16i8)
BifID = Intrinsic::ppc_altivec_vmaxsb;
-
+
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
}
+// Custom lowering for fpext v2f32 to v2f64
+SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+
+ assert(Op.getOpcode() == ISD::FP_EXTEND &&
+ "Should only be called for ISD::FP_EXTEND");
+
+ // We only want to custom lower an extend from v2f32 to v2f64.
+ if (Op.getValueType() != MVT::v2f64 ||
+ Op.getOperand(0).getValueType() != MVT::v2f32)
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue Op0 = Op.getOperand(0);
+
+ switch (Op0.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FSUB: {
+ SDValue NewLoad[2];
+ for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
+ // Ensure both input are loads.
+ SDValue LdOp = Op0.getOperand(i);
+ if (LdOp.getOpcode() != ISD::LOAD)
+ return SDValue();
+ // Generate new load node.
+ LoadSDNode *LD = cast<LoadSDNode>(LdOp);
+ SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+ NewLoad[i] =
+ DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+ DAG.getVTList(MVT::v4f32, MVT::Other),
+ LoadOps, LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+ SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
+ NewLoad[0], NewLoad[1],
+ Op0.getNode()->getFlags());
+ return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op0);
+ SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+ SDValue NewLd =
+ DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+ DAG.getVTList(MVT::v4f32, MVT::Other),
+ LoadOps, LD->getMemoryVT(), LD->getMemOperand());
+ return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
+ }
+ }
+ llvm_unreachable("ERROR:Should return for all cases within swtich.");
+}
+
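
An illustrative source pattern that reaches the ISD::LOAD case above (a sketch, not from the patch): a loaded <2 x float> widened to <2 x double>, where the load and the extend fuse into LD_VSX_LH followed by FP_EXTEND_LH instead of being scalarized.

    typedef float v2f32 __attribute__((vector_size(8)));
    typedef double v2f64 __attribute__((vector_size(16)));

    /* Illustrative: the load feeds the fpext directly. */
    v2f64 extend_demo(v2f32 *p) {
      return __builtin_convertvector(*p, v2f64);
    }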
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9661,6 +9969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
+ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
@@ -9701,7 +10010,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
- Intrinsic::ppc_is_decremented_ctr_nonzero)
+ Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
@@ -9737,6 +10046,14 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
return;
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
return;
+ case ISD::TRUNCATE: {
+ EVT TrgVT = N->getValueType(0);
+ if (TrgVT.isVector() &&
+ isOperationCustom(N->getOpcode(), TrgVT) &&
+ N->getOperand(0).getValueType().getSizeInBits() <= 128)
+ Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+ return;
+ }
case ISD::BITCAST:
// Don't handle bitcast here.
return;
@@ -9822,10 +10139,10 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI.getOperand(0).getReg();
- unsigned ptrA = MI.getOperand(1).getReg();
- unsigned ptrB = MI.getOperand(2).getReg();
- unsigned incr = MI.getOperand(3).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register ptrA = MI.getOperand(1).getReg();
+ Register ptrB = MI.getOperand(2).getReg();
+ Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -9841,7 +10158,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned TmpReg = (!BinOpcode) ? incr :
+ Register TmpReg = (!BinOpcode) ? incr :
RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
: &PPC::GPRCRegClass);
@@ -9949,20 +10266,20 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned PtrReg = RegInfo.createVirtualRegister(RC);
- unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned ShiftReg =
+ Register PtrReg = RegInfo.createVirtualRegister(RC);
+ Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
+ Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
- unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned MaskReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Ptr1Reg;
- unsigned TmpReg =
+ Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register MaskReg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
+ Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
+ Register Ptr1Reg;
+ Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
// thisMBB:
@@ -10764,23 +11081,23 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned PtrReg = RegInfo.createVirtualRegister(RC);
- unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned ShiftReg =
+ Register PtrReg = RegInfo.createVirtualRegister(RC);
+ Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
+ Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
- unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned MaskReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Ptr1Reg;
- unsigned TmpReg = RegInfo.createVirtualRegister(GPRC);
- unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+ Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register MaskReg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
+ Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
+ Register Ptr1Reg;
+ Register TmpReg = RegInfo.createVirtualRegister(GPRC);
+ Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
// thisMBB:
// ...
// fallthrough --> loopMBB
@@ -10968,7 +11285,147 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
- return BB;
+ BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(CRReg);
+ } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
+ DebugLoc Dl = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(1).getImm();
+ BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
+ BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(PPC::CR0EQ);
+ } else if (MI.getOpcode() == PPC::SETRNDi) {
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+
+ // Save FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+ // The floating-point rounding mode is in bits 62:63 of the FPSCR, and has
+ // the following settings:
+ // 00 Round to nearest
+ // 01 Round to 0
+ // 10 Round to +inf
+ // 11 Round to -inf
+
+ // When the operand is an immediate, use its two least significant bits
+ // to set bits 62:63 of the FPSCR.
+ unsigned Mode = MI.getOperand(1).getImm();
+ BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
+ .addImm(31);
+
+ BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
+ .addImm(30);
+ } else if (MI.getOpcode() == PPC::SETRND) {
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
+ // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
+ // If the target doesn't have DirectMove, we should use stack to do the
+ // conversion, because the target doesn't have the instructions like mtvsrd
+ // or mfvsrd to do this conversion directly.
+ auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
+ if (Subtarget.hasDirectMove()) {
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
+ .addReg(SrcReg);
+ } else {
+ // Use stack to do the register copy.
+ unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
+ if (RC == &PPC::F8RCRegClass) {
+ // Copy register from F8RCRegClass to G8RCRegClass.
+ assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
+ "Unsupported RegClass.");
+
+ StoreOp = PPC::STFD;
+ LoadOp = PPC::LD;
+ } else {
+ // Copy register from G8RCRegClass to F8RCRegClass.
+ assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
+ (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
+ "Unsupported RegClass.");
+ }
+
+ MachineFrameInfo &MFI = F->getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(8, 8, false);
+
+ MachineMemOperand *MMOStore = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ // Store the SrcReg into the stack.
+ BuildMI(*BB, MI, dl, TII->get(StoreOp))
+ .addReg(SrcReg)
+ .addImm(0)
+ .addFrameIndex(FrameIdx)
+ .addMemOperand(MMOStore);
+
+ MachineMemOperand *MMOLoad = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ // Load from the stack where SrcReg is stored, and save to DestReg,
+ // so we have done the RegClass conversion from RegClass::SrcReg to
+ // RegClass::DestReg.
+ BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
+ .addImm(0)
+ .addFrameIndex(FrameIdx)
+ .addMemOperand(MMOLoad);
+ }
+ };
+
+ unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+
+ // Save FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+ // When the operand is a GPR, use its two least significant bits together
+ // with the mtfsf instruction to set bits 62:63 of the FPSCR.
+ //
+ // copy OldFPSCRTmpReg, OldFPSCRReg
+ // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
+ // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
+ // copy NewFPSCRReg, NewFPSCRTmpReg
+ // mtfsf 255, NewFPSCRReg
+ MachineOperand SrcOp = MI.getOperand(1);
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+ copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
+
+ unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+ // The first operand of INSERT_SUBREG should be a register which has
+ // subregisters, we only care about its RegClass, so we should use an
+ // IMPLICIT_DEF register.
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
+ BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
+ .addReg(ImDefReg)
+ .add(SrcOp)
+ .addImm(1);
+
+ unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
+ .addReg(OldFPSCRTmpReg)
+ .addReg(ExtSrcReg)
+ .addImm(0)
+ .addImm(62);
+
+ unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+ copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
+
+ // The mask 255 selects all eight 4-bit FPSCR fields, so bits 32:63 of
+ // NewFPSCRReg are written into bits 32:63 of the FPSCR.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
+ .addImm(255)
+ .addReg(NewFPSCRReg)
+ .addImm(0)
+ .addImm(0);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
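
One way to reach the SETRNDi path above from source (assuming the PowerPC __builtin_setrnd builtin, which is not part of this patch):

    /* Illustrative: mode 2 requests round-to-+inf per the table above;
       the builtin returns the previous FPSCR contents as a double. */
    double set_round_up(void) {
      return __builtin_setrnd(2);
    }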
@@ -11006,7 +11463,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
- UseOneConstNR = true;
+ // The Newton-Raphson computation with a single constant does not provide
+ // enough accuracy on some CPUs.
+ UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
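
For reference (standard numerics, not from the patch): each Newton-Raphson refinement of an rsqrt estimate computes x' = x * (1.5 - 0.5 * a * x * x). The one-constant reassociation of that step saves a constant load but loses a little accuracy, which is why subtargets reporting needsTwoConstNR() opt out of it here.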
@@ -12062,9 +12521,14 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
"Should be called with a BUILD_VECTOR node");
SDLoc dl(N);
+
+ // Return early for non-byte-sized types, as they can't be consecutive.
+ if (!N->getValueType(0).getVectorElementType().isByteSized())
+ return SDValue();
+
bool InputsAreConsecutiveLoads = true;
bool InputsAreReverseConsecutive = true;
- unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
+ unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
SDValue FirstInput = N->getOperand(0);
bool IsRoundOfExtLoad = false;
@@ -12332,9 +12796,8 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
if (!Ext1Op || !Ext2Op)
return SDValue();
- if (Ext1.getValueType() != MVT::i32 ||
- Ext2.getValueType() != MVT::i32)
- if (Ext1.getOperand(0) != Ext2.getOperand(0))
+ if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
+ Ext1.getOperand(0) != Ext2.getOperand(0))
return SDValue();
int FirstElem = Ext1Op->getZExtValue();
@@ -12664,6 +13127,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return combineSRA(N, DCI);
case ISD::SRL:
return combineSRL(N, DCI);
+ case ISD::MUL:
+ return combineMUL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -13246,7 +13711,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero) {
+ Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
@@ -13272,14 +13737,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
@@ -13355,9 +13820,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
- case ISD::ABS:
+ case ISD::ABS:
return combineABS(N, DCI);
- case ISD::VSELECT:
+ case ISD::VSELECT:
return combineVSelect(N, DCI);
}
@@ -13453,6 +13918,15 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
if (!ML)
break;
+ if (!DisableInnermostLoopAlign32) {
+ // If the nested loop is an innermost loop, prefer a 32-byte alignment
+ // (this hook returns the log2 of the alignment, so 5 means 32 bytes) to
+ // reduce cache misses and branch-prediction misses. The actual alignment
+ // of the loop will depend on the hotness check and other logic in
+ // alignBlocks.
+ if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
+ return 5;
+ }
+
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
// For small loops (between 5 and 8 instructions), align to a 32-byte
@@ -13502,7 +13976,7 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
} else if (Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "ws" ||
- Constraint == "wi") {
+ Constraint == "wi" || Constraint == "ww") {
return C_RegisterClass; // VSX registers.
}
return TargetLowering::getConstraintType(Constraint);
@@ -13530,10 +14004,12 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
StringRef(constraint) == "wf") &&
type->isVectorTy())
return CW_Register;
- else if (StringRef(constraint) == "ws" && type->isDoubleTy())
- return CW_Register;
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
return CW_Register; // just hold 64-bit integer data.
+ else if (StringRef(constraint) == "ws" && type->isDoubleTy())
+ return CW_Register;
+ else if (StringRef(constraint) == "ww" && type->isFloatTy())
+ return CW_Register;
switch (*constraint) {
default:
@@ -13619,7 +14095,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
- } else if (Constraint == "ws" && Subtarget.hasVSX()) {
+ } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
else
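A hedged usage sketch of the new "ww" constraint (assuming a VSX-enabled POWER8 target; xsaddsp is the VSX scalar single-precision add, and the %x operand modifier prints the full VSX register number):

// "ww" asks for a VSX register holding a scalar float; with P8Vector,
// f32 values map to VSSRCRegClass per the hook above.
float addSingle(float a, float b) {
  float r;
  __asm__("xsaddsp %x0, %x1, %x2" : "=ww"(r) : "ww"(a), "ww"(b));
  return r;
}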
@@ -13865,7 +14341,7 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
if (CModel == CodeModel::Small || CModel == CodeModel::Large)
return true;
- // JumpTable and BlockAddress are accessed as got-indirect.
+ // JumpTable and BlockAddress are accessed as got-indirect.
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
return true;
@@ -14082,18 +14558,16 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
+EVT PPCTargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
- const Function &F = MF.getFunction();
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
(!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
- !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
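The QPX condition, restated as a tiny standalone predicate (names hypothetical; a zero alignment here means the alignment is unconstrained): 32-byte vector ops are only worthwhile when the operation is large and aligned enough to amortize loading the stored value from the constant pool.

#include <cstdint>

// Sketch of the heuristic guarding the MVT::v4f64 answer above.
bool useQPXMemOp(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                 bool IsMemset) {
  return Size >= 32 && (!IsMemset || Size >= 64) &&
         (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32);
}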
@@ -14178,6 +14652,7 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
+ MachineMemOperand::Flags,
bool *Fast) const {
if (DisablePPCUnaligned)
return false;
@@ -14324,7 +14799,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
- // Insert the copy-back instructions right before the terminator
+ // Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
@@ -14345,7 +14820,8 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
return TargetLowering::insertSSPDeclarations(M);
}
-bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
@@ -14585,6 +15061,89 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
return SDValue();
}
+SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
+ if (!ConstOpOrElement)
+ return SDValue();
+
+ // An imul is usually smaller than the alternative sequence for legal types.
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ isOperationLegal(ISD::MUL, N->getValueType(0)))
+ return SDValue();
+
+ auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
+ switch (this->Subtarget.getDarwinDirective()) {
+ default:
+ // TODO: enhance the condition for subtargets before pwr8
+ return false;
+ case PPC::DIR_PWR8:
+ // type mul add shl
+ // scalar 4 1 1
+ // vector 7 2 2
+ return true;
+ case PPC::DIR_PWR9:
+ // type mul add shl
+ // scalar 5 2 2
+ // vector 7 2 2
+
+ // The cycle ratios of the relevant operations are shown in the table
+ // above. Because mul costs 5 (scalar) / 7 (vector) cycles while
+ // add/sub/shl all cost 2 for both scalar and vector types, the
+ // two-instruction patterns (add/sub + shl, 4 cycles in total) are always
+ // profitable. The three-instruction pattern
+ // (mul x, -(2^N + 1)) => -(add (shl x, N), x) needs sub + add + shl,
+ // i.e. 6 cycles, which only beats mul for vector types.
+ return IsAddOne && IsNeg ? VT.isVector() : true;
+ }
+ };
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
+ bool IsNeg = MulAmt.isNegative();
+ APInt MulAmtAbs = MulAmt.abs();
+
+ if ((MulAmtAbs - 1).isPowerOf2()) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
+
+ if (!IsProfitable(IsNeg, true, VT))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
+ SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+
+ if (!IsNeg)
+ return Res;
+
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+ } else if ((MulAmtAbs + 1).isPowerOf2()) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+
+ if (!IsProfitable(IsNeg, false, VT))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
+
+ if (!IsNeg)
+ return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
+ else
+ return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+
+ } else {
+ return SDValue();
+ }
+}
+
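Worked instances of the four rewrites, as plain scalar C++ (illustrative only; the combine above builds the equivalent DAG nodes):

long mulBy5(long x)  { return (x << 2) + x; }    // (mul x, 2^2 + 1)
long mulByM5(long x) { return -((x << 2) + x); } // (mul x, -(2^2 + 1))
long mulBy7(long x)  { return (x << 3) - x; }    // (mul x, 2^3 - 1)
long mulByM7(long x) { return x - (x << 3); }    // (mul x, -(2^3 - 1))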
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())