Diffstat (limited to 'lib/Target/PowerPC')
 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp |   2
 lib/Target/PowerPC/PPCCTRLoops.cpp                |   5
 lib/Target/PowerPC/PPCFrameLowering.cpp           |  36
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp            |  43
 lib/Target/PowerPC/PPCISelLowering.cpp            | 177
 lib/Target/PowerPC/PPCISelLowering.h              |   7
 lib/Target/PowerPC/PPCInstrInfo.td                | 191
 lib/Target/PowerPC/PPCInstrVSX.td                 | 190
 lib/Target/PowerPC/PPCScheduleP9.td               |   4
 lib/Target/PowerPC/PPCSubtarget.h                 |   7
 lib/Target/PowerPC/PPCVSXSwapRemoval.cpp          |   4
 11 files changed, 577 insertions(+), 89 deletions(-)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 7393f3d7a08a..bdad2fe8714f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -115,7 +115,7 @@ public:
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
- uint64_t Value, bool IsPCRel) const override {
+ uint64_t Value, bool IsResolved) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 094d3e6a61b5..53f33ac1fc0e 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -607,7 +607,10 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// The old condition may be dead now, and may have even created a dead PHI
// (the original induction variable).
RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- DeleteDeadPHIs(CountedExitBlock);
+ // Run through the basic blocks of the loop and see if any of them have dead
+ // PHIs that can be removed.
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
++NumCTRLoops;
return MadeChange;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c2c115cb6daf..b49c3345a17d 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -435,22 +435,19 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- // If we are a leaf function, and use up to 224 bytes of stack space,
- // don't have a frame pointer, calls, or dynamic alloca then we do not need
- // to adjust the stack pointer (we fit in the Red Zone).
- // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
- // stackless code if all local vars are reg-allocated.
- bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
unsigned LR = RegInfo->getRARegister();
- if (!DisableRedZone &&
- (Subtarget.isPPC64() || // 32-bit SVR4, no stack-
- !Subtarget.isSVR4ABI() || // allocated locals.
- FrameSize == 0) &&
- FrameSize <= 224 && // Fits in red zone.
- !MFI.hasVarSizedObjects() && // No dynamic alloca.
- !MFI.adjustsStack() && // No calls.
- !MustSaveLR(MF, LR) &&
- !RegInfo->hasBasePointer(MF)) { // No special alignment.
+ bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+ bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !MustSaveLR(MF, LR) && // No need to save LR.
+ !RegInfo->hasBasePointer(MF); // No special alignment.
+
+ // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless
+ // code if all local vars are reg-allocated.
+ bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize();
+
+ // Check whether we can skip adjusting the stack pointer (by using red zone)
+ if (!DisableRedZone && CanUseRedZone && FitsInRedZone) {
// No need for frame
if (UpdateMF)
MFI.setStackSize(0);
@@ -1869,8 +1866,13 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
}
if (HasVRSaveArea) {
- // Insert alignment padding, we need 16-byte alignment.
- LowerBound = (LowerBound - 15) & ~(15);
+ // Insert alignment padding; we need 16-byte alignment. Note: for a
+ // positive number x, the rounding-up formula is y = (x + (n-1)) & ~(n-1).
+ // But since we are working with a negative number here (the stack grows
+ // downward), we should use the formula y = x & ~(n-1), where x is the size
+ // before aligning, n is the alignment size (n = 16 here), and y is the
+ // size after aligning.
+ assert(LowerBound <= 0 && "Expect LowerBound to have a non-positive value!");
+ LowerBound &= ~(15);
for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
int FI = VRegs[i].getFrameIdx();
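A quick worked example of the two alignment formulas described in the hunk above (a standalone illustrative sketch, not code from the patch):

#include <cassert>
#include <cstdint>

// Round a non-negative size up to a multiple of N.
int64_t alignUp(int64_t X, int64_t N) { return (X + (N - 1)) & ~(N - 1); }

// Round a non-positive offset down (more negative); because the stack grows
// downward, LowerBound only needs the masking step.
int64_t alignDown(int64_t X, int64_t N) { return X & ~(N - 1); }

int main() {
  assert(alignUp(20, 16) == 32);     // positive sizes round toward +infinity
  assert(alignDown(-20, 16) == -32); // negative offsets round toward -infinity
  return 0;
}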
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 535b9deaefac..3aaf7ef2c2a0 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -419,25 +419,6 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
.getNode();
}
-/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
-/// or 64-bit immediate, and if the value can be accurately represented as a
-/// sign extension from a 16-bit value. If so, this returns true and the
-/// immediate.
-static bool isIntS16Immediate(SDNode *N, short &Imm) {
- if (N->getOpcode() != ISD::Constant)
- return false;
-
- Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
- if (N->getValueType(0) == MVT::i32)
- return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
- else
- return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
-}
-
-static bool isIntS16Immediate(SDValue Op, short &Imm) {
- return isIntS16Immediate(Op.getNode(), Imm);
-}
-
/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -728,7 +709,10 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) {
static unsigned getInt64Count(int64_t Imm) {
unsigned Count = getInt64CountDirect(Imm);
- if (Count == 1)
+
+ // If the instruction count is 1 or 2, we do not need further analysis
+ // since rotate + load constant requires at least 2 instructions.
+ if (Count <= 2)
return Count;
for (unsigned r = 1; r < 63; ++r) {
@@ -838,7 +822,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
unsigned Count = getInt64CountDirect(Imm);
- if (Count == 1)
+
+ // If the instruction count is 1 or 2, we do not need further analysis
+ // since rotate + load constant requires at least 2 instructions.
+ if (Count <= 2)
return getInt64Direct(CurDAG, dl, Imm);
unsigned RMin = 0;
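The early exits above can be read as a simple cost comparison; a minimal sketch of the reasoning (illustrative only, with hypothetical helper names):

#include <algorithm>

// A rotate-based sequence costs the instructions for the rotated constant
// plus one rotate, i.e. at least 2 instructions in total, so it can never
// beat a direct sequence of 1 or 2 instructions.
unsigned bestInt64Count(unsigned DirectCount, unsigned RotatedDirectCount) {
  if (DirectCount <= 2)
    return DirectCount; // the early exit in getInt64Count()/getInt64()
  return std::min(DirectCount, RotatedDirectCount + 1);
}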
@@ -2126,7 +2113,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
getI32Imm(Imm & 0xFFFF, dl)), 0);
Opc = PPC::CMPLW;
} else {
- short SImm;
+ int16_t SImm;
if (isIntS16Immediate(RHS, SImm))
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
getI32Imm((int)SImm & 0xFFFF,
@@ -2173,7 +2160,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
getI64Imm(Imm & 0xFFFF, dl)), 0);
Opc = PPC::CMPLD;
} else {
- short SImm;
+ int16_t SImm;
if (isIntS16Immediate(RHS, SImm))
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
getI64Imm(SImm & 0xFFFF, dl)),
@@ -3323,7 +3310,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (tryLogicOpOfCompares(N))
return;
- short Imm;
+ int16_t Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
isIntS16Immediate(N->getOperand(1), Imm)) {
KnownBits LHSKnown;
@@ -3346,7 +3333,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
break;
}
case ISD::ADD: {
- short Imm;
+ int16_t Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
isIntS16Immediate(N->getOperand(1), Imm)) {
selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
@@ -4034,11 +4021,13 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
O0.getNode(), O1.getNode());
};
+ // FIXME: When the semantics of the interaction between select and undef
+ // are clearly defined, it may turn out to be unnecessary to break here.
SDValue TrueRes = TryFold(ConstTrue);
- if (!TrueRes)
+ if (!TrueRes || TrueRes.isUndef())
break;
SDValue FalseRes = TryFold(ConstFalse);
- if (!FalseRes)
+ if (!FalseRes || FalseRes.isUndef())
break;
// For us to materialize these using one instruction, we must be able to
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 72f14e969138..0e069ec1665f 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -136,6 +136,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
+ // Match BITREVERSE to a customized fast code sequence in the .td file.
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -1168,6 +1172,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
+ case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
@@ -2028,17 +2033,17 @@ int PPC::isQVALIGNIShuffleMask(SDNode *N) {
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and the
/// immediate.
-static bool isIntS16Immediate(SDNode *N, short &Imm) {
+bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
- Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
if (N->getValueType(0) == MVT::i32)
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
else
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
-static bool isIntS16Immediate(SDValue Op, short &Imm) {
+bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
@@ -2048,7 +2053,7 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) {
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
- short imm = 0;
+ int16_t imm = 0;
if (N.getOpcode() == ISD::ADD) {
if (isIntS16Immediate(N.getOperand(1), imm))
return false; // r+i
@@ -2138,7 +2143,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
return false;
if (N.getOpcode() == ISD::ADD) {
- short imm = 0;
+ int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!Aligned || (imm & 3) == 0)) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
@@ -2162,7 +2167,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
return true; // [&g+r]
}
} else if (N.getOpcode() == ISD::OR) {
- short imm = 0;
+ int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!Aligned || (imm & 3) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
@@ -2190,7 +2195,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
- short Imm;
+ int16_t Imm;
if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
@@ -2235,10 +2240,15 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
if (SelectAddressRegReg(N, Base, Index, DAG))
return true;
- // If the operand is an addition, always emit this as [r+r], since this is
- // better (for code size, and execution, as the memop does the add for free)
- // than emitting an explicit add.
- if (N.getOpcode() == ISD::ADD) {
+ // If the address is the result of an add, we will utilize the fact that the
+ // address calculation includes an implicit add. However, we can reduce
+ // register pressure if we do not materialize a constant just for use as the
+ // index register. We only get rid of the add if it is not an add of a
+ // value and a 16-bit signed constant and both have a single use.
+ int16_t imm = 0;
+ if (N.getOpcode() == ISD::ADD &&
+ (!isIntS16Immediate(N.getOperand(1), imm) ||
+ !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
@@ -6422,7 +6432,7 @@ PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
- // Get the corect type for integers.
+ // Get the correct type for integers.
EVT IntVT = Op.getValueType();
// Get the inputs.
@@ -6439,7 +6449,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
- // Get the corect type for pointers.
+ // Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
@@ -6514,7 +6524,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
- // Get the corect type for pointers.
+ // Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
@@ -6645,6 +6655,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
+ LLVM_FALLTHROUGH;
case ISD::SETEQ:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
@@ -6656,6 +6667,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETULT:
case ISD::SETLT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
+ LLVM_FALLTHROUGH;
case ISD::SETOGE:
case ISD::SETGE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
@@ -6664,6 +6676,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETUGT:
case ISD::SETGT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
+ LLVM_FALLTHROUGH;
case ISD::SETOLE:
case ISD::SETLE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
@@ -6677,6 +6690,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
+ LLVM_FALLTHROUGH;
case ISD::SETEQ:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
@@ -11311,6 +11325,132 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// This function adds the vector_shuffle needed to get the elements of the
+// vector extracts into the positions specified by the CorrectElems encoding.
+static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
+ SDValue Input, uint64_t Elems,
+ uint64_t CorrectElems) {
+ SDLoc dl(N);
+
+ unsigned NumElems = Input.getValueType().getVectorNumElements();
+ SmallVector<int, 16> ShuffleMask(NumElems, -1);
+
+ // Knowing the element indices being extracted from the original
+ // vector and the order in which they're being inserted, just put
+ // them at element indices required for the instruction.
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (DAG.getDataLayout().isLittleEndian())
+ ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
+ else
+ ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
+ CorrectElems = CorrectElems >> 8;
+ Elems = Elems >> 8;
+ }
+
+ SDValue Shuffle =
+ DAG.getVectorShuffle(Input.getValueType(), dl, Input,
+ DAG.getUNDEF(Input.getValueType()), ShuffleMask);
+
+ EVT Ty = N->getValueType(0);
+ SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
+ return BV;
+}
+
+// Look for build_vector patterns where the input operands come from
+// sign-extended vector_extract elements of specific indices. If the correct
+// indices aren't used, add a vector shuffle to fix up the indices and create
+// a new PPCISD::SExtVElems node, which selects the vector sign-extend
+// instructions during instruction selection.
+static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
+ // This array encodes the indices that the vector sign extend instructions
+ // extract from when extending from one type to another for both BE and LE.
+ // The right nibble of each byte corresponds to the LE incides.
+ // and the left nibble of each byte corresponds to the BE incides.
+ // For example: 0x3074B8FC byte->word
+ // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
+ // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
+ // For example: 0x000070F8 byte->double word
+ // For LE: the allowed indices are: 0x0,0x8
+ // For BE: the allowed indices are: 0x7,0xF
+ uint64_t TargetElems[] = {
+ 0x3074B8FC, // b->w
+ 0x000070F8, // b->d
+ 0x10325476, // h->w
+ 0x00003074, // h->d
+ 0x00001032, // w->d
+ };
+
+ uint64_t Elems = 0;
+ int Index;
+ SDValue Input;
+
+ auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
+ if (!Op)
+ return false;
+ if (Op.getOpcode() != ISD::SIGN_EXTEND)
+ return false;
+
+ SDValue Extract = Op.getOperand(0);
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+
+ ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+ if (!ExtOp)
+ return false;
+
+ Index = ExtOp->getZExtValue();
+ if (Input && Input != Extract.getOperand(0))
+ return false;
+
+ if (!Input)
+ Input = Extract.getOperand(0);
+
+ Elems = Elems << 8;
+ Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
+ Elems |= Index;
+
+ return true;
+ };
+
+ // If the build vector operands aren't sign-extended vector extracts
+ // of the same input vector, then return.
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (!isSExtOfVecExtract(N->getOperand(i))) {
+ return SDValue();
+ }
+ }
+
+ // If the vector extract indices are not correct, add the appropriate
+ // vector_shuffle.
+ int TgtElemArrayIdx;
+ int InputSize = Input.getValueType().getScalarSizeInBits();
+ int OutputSize = N->getValueType(0).getScalarSizeInBits();
+ if (InputSize + OutputSize == 40)
+ TgtElemArrayIdx = 0;
+ else if (InputSize + OutputSize == 72)
+ TgtElemArrayIdx = 1;
+ else if (InputSize + OutputSize == 48)
+ TgtElemArrayIdx = 2;
+ else if (InputSize + OutputSize == 80)
+ TgtElemArrayIdx = 3;
+ else if (InputSize + OutputSize == 96)
+ TgtElemArrayIdx = 4;
+ else
+ return SDValue();
+
+ uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
+ CorrectElems = DAG.getDataLayout().isLittleEndian()
+ ? CorrectElems & 0x0F0F0F0F0F0F0F0F
+ : CorrectElems & 0xF0F0F0F0F0F0F0F0;
+ if (Elems != CorrectElems) {
+ return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
+ }
+
+ // Regular lowering will catch cases where a shuffle is not needed.
+ return SDValue();
+}
+
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -11338,6 +11478,15 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
if (Reduced)
return Reduced;
+ // If we're building a vector out of extended elements from another vector,
+ // we have P9 vector integer extend instructions.
+ if (Subtarget.hasP9Altivec()) {
+ Reduced = combineBVOfVecSExt(N, DAG);
+ if (Reduced)
+ return Reduced;
+ }
+
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
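To make the nibble encoding used by combineBVOfVecSExt concrete, here is a small standalone decoder (an illustrative sketch that mirrors the comment's convention, not code from the patch):

#include <cstdint>
#include <cstdio>

// Each byte of an encoding holds the BE element index in its high nibble and
// the LE element index in its low nibble, least-significant byte first.
void decodeTargetElems(uint64_t Encoded, unsigned NumElems) {
  for (unsigned i = 0; i < NumElems; ++i) {
    unsigned LE = Encoded & 0xF;
    unsigned BE = (Encoded & 0xF0) >> 4;
    std::printf("elem %u: LE index 0x%X, BE index 0x%X\n", i, LE, BE);
    Encoded >>= 8;
  }
}

// decodeTargetElems(0x3074B8FC, 4) lists LE indices 0xC,0x8,0x4,0x0 and BE
// indices 0xF,0xB,0x7,0x3 -- the byte->word indices given in the comment.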
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index a5108727bb4b..821927d3b157 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -67,6 +67,10 @@ namespace llvm {
/// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
VEXTS,
+ /// SExtVElems, takes an input vector of a smaller type and sign
+ /// extends to an output vector of a larger type.
+ SExtVElems,
+
/// Reciprocal estimate instructions (unary FP ops).
FRE, FRSQRTE,
@@ -1092,6 +1096,9 @@ namespace llvm {
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
+ bool isIntS16Immediate(SDNode *N, int16_t &Imm);
+ bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+
} // end namespace llvm
#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 47d59c25392a..6d9f55206b6a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -32,6 +32,9 @@ def SDT_PPCstxsix : SDTypeProfile<0, 3, [
def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
+def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisVec<1>
+]>;
def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -131,6 +134,7 @@ def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
[SDNPHasChain, SDNPMayStore]>;
def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
+def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>;
// Extract FPSCR (not modeled at the DAG level).
def PPCmffs : SDNode<"PPCISD::MFFS",
@@ -4450,3 +4454,190 @@ def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
} // IsISA3_0
+
+// Fast 32-bit reverse bits algorithm:
+// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
+// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
+// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
+// n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xCCCCCCCC);
+// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
+// n = ((n >> 4) & 0x0F0F0F0F) | ((n << 4) & 0xF0F0F0F0);
+// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4]):
+// Step 4.1: Put B4,B2 in the right position (rotate left 3 bytes):
+// n' = (n rotl 24); After which n' = [B4, B1, B2, B3]
+// Step 4.2: Insert B3 to the right position:
+// n' = rlwimi n', n, 8, 8, 15; After which n' = [B4, B3, B2, B3]
+// Step 4.3: Insert B1 to the right position:
+// n' = rlwimi n', n, 8, 24, 31; After which n' = [B4, B3, B2, B1]
+def MaskValues {
+ dag Lo1 = (ORI (LIS 0x5555), 0x5555);
+ dag Hi1 = (ORI (LIS 0xAAAA), 0xAAAA);
+ dag Lo2 = (ORI (LIS 0x3333), 0x3333);
+ dag Hi2 = (ORI (LIS 0xCCCC), 0xCCCC);
+ dag Lo4 = (ORI (LIS 0x0F0F), 0x0F0F);
+ dag Hi4 = (ORI (LIS 0xF0F0), 0xF0F0);
+}
+
+def Shift1 {
+ dag Right = (RLWINM $A, 31, 1, 31);
+ dag Left = (RLWINM $A, 1, 0, 30);
+}
+
+def Swap1 {
+ dag Bit = (OR (AND Shift1.Right, MaskValues.Lo1),
+ (AND Shift1.Left, MaskValues.Hi1));
+}
+
+def Shift2 {
+ dag Right = (RLWINM Swap1.Bit, 30, 2, 31);
+ dag Left = (RLWINM Swap1.Bit, 2, 0, 29);
+}
+
+def Swap2 {
+ dag Bits = (OR (AND Shift2.Right, MaskValues.Lo2),
+ (AND Shift2.Left, MaskValues.Hi2));
+}
+
+def Shift4 {
+ dag Right = (RLWINM Swap2.Bits, 28, 4, 31);
+ dag Left = (RLWINM Swap2.Bits, 4, 0, 27);
+}
+
+def Swap4 {
+ dag Bits = (OR (AND Shift4.Right, MaskValues.Lo4),
+ (AND Shift4.Left, MaskValues.Hi4));
+}
+
+def Rotate {
+ dag Left3Bytes = (RLWINM Swap4.Bits, 24, 0, 31);
+}
+
+def RotateInsertByte3 {
+ dag Left = (RLWIMI Rotate.Left3Bytes, Swap4.Bits, 8, 8, 15);
+}
+
+def RotateInsertByte1 {
+ dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31);
+}
+
+def : Pat<(i32 (bitreverse i32:$A)),
+ (RLDICL_32 RotateInsertByte1.Left, 0, 32)>;
+
+// Fast 64-bit reverse bits algorithm:
+// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
+// n = ((n >> 1) & 0x5555555555555555) | ((n << 1) & 0xAAAAAAAAAAAAAAAA);
+// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
+// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC);
+// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
+// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0);
+// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]):
+// Apply the same byte reverse algorithm mentioned above for the fast 32-bit
+// reverse to both the high 32 bit and low 32 bit of the 64 bit value. And
+// then OR them together to get the final result.
+def MaskValues64 {
+ dag Lo1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo1, sub_32));
+ dag Hi1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi1, sub_32));
+ dag Lo2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo2, sub_32));
+ dag Hi2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi2, sub_32));
+ dag Lo4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo4, sub_32));
+ dag Hi4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi4, sub_32));
+}
+
+def DWMaskValues {
+ dag Lo1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo1, 32, 31), 0x5555), 0x5555);
+ dag Hi1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi1, 32, 31), 0xAAAA), 0xAAAA);
+ dag Lo2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo2, 32, 31), 0x3333), 0x3333);
+ dag Hi2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi2, 32, 31), 0xCCCC), 0xCCCC);
+ dag Lo4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo4, 32, 31), 0x0F0F), 0x0F0F);
+ dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0);
+}
+
+def DWShift1 {
+ dag Right = (RLDICL $A, 63, 1);
+ dag Left = (RLDICR $A, 1, 62);
+}
+
+def DWSwap1 {
+ dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1),
+ (AND8 DWShift1.Left, DWMaskValues.Hi1));
+}
+
+def DWShift2 {
+ dag Right = (RLDICL DWSwap1.Bit, 62, 2);
+ dag Left = (RLDICR DWSwap1.Bit, 2, 61);
+}
+
+def DWSwap2 {
+ dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2),
+ (AND8 DWShift2.Left, DWMaskValues.Hi2));
+}
+
+def DWShift4 {
+ dag Right = (RLDICL DWSwap2.Bits, 60, 4);
+ dag Left = (RLDICR DWSwap2.Bits, 4, 59);
+}
+
+def DWSwap4 {
+ dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4),
+ (AND8 DWShift4.Left, DWMaskValues.Hi4));
+}
+
+// Bit swap is done, now start byte swap.
+def DWExtractLo32 {
+ dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32));
+}
+
+def DWRotateLo32 {
+ dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31);
+}
+
+def DWLo32RotateInsertByte3 {
+ dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15);
+}
+
+// Lower 32 bits in the right order
+def DWLo32RotateInsertByte1 {
+ dag Left =
+ (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31);
+}
+
+def ExtendLo32 {
+ dag To64Bit =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ DWLo32RotateInsertByte1.Left, sub_32));
+}
+
+def DWShiftHi32 { // (SRDI DWSwap4.Bits, 32)
+ dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32);
+}
+
+def DWExtractHi32 {
+ dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32));
+}
+
+def DWRotateHi32 {
+ dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31);
+}
+
+def DWHi32RotateInsertByte3 {
+ dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15);
+}
+
+// High 32 bits in the right order, but in the low 32-bit position
+def DWHi32RotateInsertByte1 {
+ dag Left =
+ (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31);
+}
+
+def ExtendHi32 {
+ dag To64Bit =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ DWHi32RotateInsertByte1.Left, sub_32));
+}
+
+def DWShiftLo32 { // (SLDI ExtendHi32.To64Bit, 32)
+ dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31);
+}
+
+def : Pat<(i64 (bitreverse i64:$A)),
+ (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>;
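The bit-reversal patterns above may be easier to follow in plain C++; a minimal sketch of the same algorithm (assuming uint32_t/uint64_t inputs; for illustration, not code from the patch):

#include <cstdint>

uint32_t reverseBits32(uint32_t n) {
  // Step 1: swap adjacent single bits.
  n = ((n >> 1) & 0x55555555u) | ((n << 1) & 0xAAAAAAAAu);
  // Step 2: swap adjacent 2-bit fields.
  n = ((n >> 2) & 0x33333333u) | ((n << 2) & 0xCCCCCCCCu);
  // Step 3: swap adjacent 4-bit fields.
  n = ((n >> 4) & 0x0F0F0F0Fu) | ((n << 4) & 0xF0F0F0F0u);
  // Step 4: byte-reverse the word (the rotlwi/rlwimi sequence above).
  return (n << 24) | ((n & 0xFF00u) << 8) | ((n >> 8) & 0xFF00u) | (n >> 24);
}

uint64_t reverseBits64(uint64_t n) {
  uint32_t Lo = static_cast<uint32_t>(n);       // low 32 bits
  uint32_t Hi = static_cast<uint32_t>(n >> 32); // high 32 bits
  // Reversing the whole doubleword sends the reversed low half to the high
  // half and vice versa -- the final OR8 of the two 32-bit results above.
  return (static_cast<uint64_t>(reverseBits32(Lo)) << 32) | reverseBits32(Hi);
}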
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 9cfc897cdb3f..43635a8919e2 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1901,6 +1901,98 @@ let Predicates = [IsLittleEndian, HasVSX] in
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+// Variable index unsigned vector_extract on Power9
+let Predicates = [HasP9Altivec, IsLittleEndian] in {
+ def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBRX $Idx, $S)>;
+
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHRX (LI8 0), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHRX (LI8 2), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHRX (LI8 4), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHRX (LI8 6), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHRX (LI8 8), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHRX (LI8 10), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHRX (LI8 12), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHRX (LI8 14), $S)>;
+
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWRX (LI8 0), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (VEXTUWRX (LI8 4), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (VEXTUWRX (LI8 8), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWRX (LI8 12), $S)>;
+
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWRX (LI8 0), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (VEXTUWRX (LI8 4), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (VEXTUWRX (LI8 8), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWRX (LI8 12), $S))>;
+}
+let Predicates = [HasP9Altivec, IsBigEndian] in {
+ def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBLX $Idx, $S)>;
+
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHLX (LI8 0), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHLX (LI8 2), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHLX (LI8 4), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHLX (LI8 6), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHLX (LI8 8), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHLX (LI8 10), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHLX (LI8 12), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHLX (LI8 14), $S)>;
+
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWLX (LI8 0), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (VEXTUWLX (LI8 4), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (VEXTUWLX (LI8 8), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWLX (LI8 12), $S)>;
+
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWLX (LI8 0), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (VEXTUWLX (LI8 4), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (VEXTUWLX (LI8 8), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWLX (LI8 12), $S))>;
+}
+
let Predicates = [IsLittleEndian, HasDirectMove] in {
// v16i8 scalar <-> vector conversions (LE)
def : Pat<(v16i8 (scalar_to_vector i32:$A)),
@@ -2729,36 +2821,54 @@ def DblToFlt {
}
def ByteToWord {
- dag A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
- dag A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
- dag A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
- dag A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+ dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
+ dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
+ dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
+ dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+ dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8));
+ dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8));
+ dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8));
+ dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8));
}
def ByteToDWord {
- dag A0 = (i64 (sext_inreg
- (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
- dag A1 = (i64 (sext_inreg
- (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+ dag LE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
+ dag LE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+ dag BE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8));
+ dag BE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8));
}
def HWordToWord {
- dag A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
- dag A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
- dag A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
- dag A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+ dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
+ dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
+ dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
+ dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+ dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16));
+ dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16));
+ dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16));
+ dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16));
}
def HWordToDWord {
- dag A0 = (i64 (sext_inreg
- (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
- dag A1 = (i64 (sext_inreg
- (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+ dag LE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
+ dag LE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+ dag BE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16));
+ dag BE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16));
}
def WordToDWord {
- dag A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
- dag A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+ dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
+ dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+ dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1))));
+ dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3))));
}
def FltToIntLoad {
@@ -3016,18 +3126,46 @@ let AddedComplexity = 400 in {
// P9 Altivec instructions that can be used to build vectors.
// Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
// with complexities of existing build vector patterns in this file.
- let Predicates = [HasP9Altivec] in {
- def : Pat<(v2i64 (build_vector WordToDWord.A0, WordToDWord.A1)),
+ let Predicates = [HasP9Altivec, IsLittleEndian] in {
+ def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
(v2i64 (VEXTSW2D $A))>;
- def : Pat<(v2i64 (build_vector HWordToDWord.A0, HWordToDWord.A1)),
+ def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
(v2i64 (VEXTSH2D $A))>;
- def : Pat<(v4i32 (build_vector HWordToWord.A0, HWordToWord.A1,
- HWordToWord.A2, HWordToWord.A3)),
+ def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
+ HWordToWord.LE_A2, HWordToWord.LE_A3)),
(v4i32 (VEXTSH2W $A))>;
- def : Pat<(v4i32 (build_vector ByteToWord.A0, ByteToWord.A1,
- ByteToWord.A2, ByteToWord.A3)),
+ def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
+ ByteToWord.LE_A2, ByteToWord.LE_A3)),
(v4i32 (VEXTSB2W $A))>;
- def : Pat<(v2i64 (build_vector ByteToDWord.A0, ByteToDWord.A1)),
+ def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
(v2i64 (VEXTSB2D $A))>;
}
+
+ let Predicates = [HasP9Altivec, IsBigEndian] in {
+ def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
+ (v2i64 (VEXTSW2D $A))>;
+ def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
+ (v2i64 (VEXTSH2D $A))>;
+ def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
+ HWordToWord.BE_A2, HWordToWord.BE_A3)),
+ (v4i32 (VEXTSH2W $A))>;
+ def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
+ ByteToWord.BE_A2, ByteToWord.BE_A3)),
+ (v4i32 (VEXTSB2W $A))>;
+ def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
+ (v2i64 (VEXTSB2D $A))>;
+ }
+
+ let Predicates = [HasP9Altivec] in {
+ def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)),
+ (v2i64 (VEXTSB2D $A))>;
+ def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)),
+ (v2i64 (VEXTSH2D $A))>;
+ def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)),
+ (v2i64 (VEXTSW2D $A))>;
+ def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)),
+ (v4i32 (VEXTSB2W $A))>;
+ def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)),
+ (v4i32 (VEXTSH2W $A))>;
+ }
}
diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td
index a9c1bd78b05e..a01995a629c2 100644
--- a/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/lib/Target/PowerPC/PPCScheduleP9.td
@@ -260,8 +260,8 @@ let SchedModel = P9Model in {
// ***************** Defining Itinerary Class Resources *****************
- def : ItinRW<[P9_DFU_76C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple,
- IIC_IntGeneral]>;
+ def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_IntSimple, IIC_IntGeneral]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 5a97f595ad8c..90d11f46a384 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -272,6 +272,13 @@ public:
return 16;
}
+
+ // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI (non-DarwinABI) has no
+ // red zone, and PPC64 SVR4ABI has a 288-byte red zone.
+ unsigned getRedZoneSize() const {
+ return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0);
+ }
+
bool hasHTM() const { return HasHTM; }
bool hasFusion() const { return HasFusion; }
bool hasFloat128() const { return HasFloat128; }
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 491eaf326a50..7d34efd4af3e 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -195,8 +195,10 @@ public:
return false;
// If we don't have VSX on the subtarget, don't do anything.
+ // Also, on Power9 the load and store ops preserve element order, so the
+ // swaps are not required.
const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
- if (!STI.hasVSX())
+ if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
return false;
bool Changed = false;