Diffstat (limited to 'llvm/lib/Target/SystemZ/SystemZISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 655
1 file changed, 522 insertions, 133 deletions
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index c73905d3357a..eb1e51341ec4 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -88,25 +88,27 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, else addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass); addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); - if (Subtarget.hasVector()) { - addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); - addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); - } else { - addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); - addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); - } - if (Subtarget.hasVectorEnhancements1()) - addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); - else - addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (!useSoftFloat()) { + if (Subtarget.hasVector()) { + addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); + addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); + } else { + addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); + addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); + } + if (Subtarget.hasVectorEnhancements1()) + addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); + else + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); - if (Subtarget.hasVector()) { - addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); - addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); - addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); - addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); - addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass); - addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass); + if (Subtarget.hasVector()) { + addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass); + } } // Compute derived properties from the register classes @@ -639,12 +641,16 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::STRICT_FP_ROUND); setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::STRICT_FP_EXTEND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::SDIV); setTargetDAGCombine(ISD::UDIV); setTargetDAGCombine(ISD::SREM); setTargetDAGCombine(ISD::UREM); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); // Handle intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -666,6 +672,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, IsStrictFPEnabled = true; } +bool SystemZTargetLowering::useSoftFloat() const { + return Subtarget.hasSoftFloat(); +} + EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) @@ -816,6 +826,15 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget); } +/// Returns true if stack probing through inline assembly is requested. 
+bool SystemZTargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + return false; +} + bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // We can use CGFI or CLGFI. return isInt<32>(Imm) || isUInt<32>(Imm); @@ -1123,12 +1142,14 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &SystemZ::GRH32BitRegClass); case 'f': // Floating-point register - if (VT == MVT::f64) - return std::make_pair(0U, &SystemZ::FP64BitRegClass); - else if (VT == MVT::f128) - return std::make_pair(0U, &SystemZ::FP128BitRegClass); - return std::make_pair(0U, &SystemZ::FP32BitRegClass); - + if (!useSoftFloat()) { + if (VT == MVT::f64) + return std::make_pair(0U, &SystemZ::FP64BitRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SystemZ::FP128BitRegClass); + return std::make_pair(0U, &SystemZ::FP32BitRegClass); + } + break; case 'v': // Vector register if (Subtarget.hasVector()) { if (VT == MVT::f32) @@ -1156,6 +1177,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( SystemZMC::GR64Regs, 16); } if (Constraint[1] == 'f') { + if (useSoftFloat()) + return std::make_pair( + 0u, static_cast<const TargetRegisterClass *>(nullptr)); if (VT == MVT::f32) return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass, SystemZMC::FP32Regs, 16); @@ -1166,6 +1190,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( SystemZMC::FP64Regs, 16); } if (Constraint[1] == 'v') { + if (!Subtarget.hasVector()) + return std::make_pair( + 0u, static_cast<const TargetRegisterClass *>(nullptr)); if (VT == MVT::f32) return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass, SystemZMC::VR32Regs, 32); @@ -1179,6 +1206,19 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +Register SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT, + const MachineFunction &MF) const { + + Register Reg = StringSwitch<Register>(RegName) + .Case("r15", SystemZ::R15D) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + void SystemZTargetLowering:: LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, @@ -1437,17 +1477,19 @@ SDValue SystemZTargetLowering::LowerFormalArguments( // ...and a similar frame index for the caller-allocated save area // that will be used to store the incoming registers. - int64_t RegSaveOffset = -SystemZMC::CallFrameSize; + int64_t RegSaveOffset = + -SystemZMC::CallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16; unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true); FuncInfo->setRegSaveFrameIndex(RegSaveIndex); // Store the FPR varargs in the reserved frame slots. (We store the // GPRs as part of the prologue.) 
- if (NumFixedFPRs < SystemZ::NumArgFPRs) { + if (NumFixedFPRs < SystemZ::NumArgFPRs && !useSoftFloat()) { SDValue MemOps[SystemZ::NumArgFPRs]; for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) { - unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]); - int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true); + unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ArgFPRs[I]); + int FI = + MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize + Offset, true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I], &SystemZ::FP64BitRegClass); @@ -1633,6 +1675,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops); Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); // Mark the end of the call, which is glued to the call itself. @@ -2020,8 +2063,9 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL, // We must have an 8- or 16-bit load. auto *Load = cast<LoadSDNode>(C.Op0); - unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits(); - if (NumBits != 8 && NumBits != 16) + unsigned NumBits = Load->getMemoryVT().getSizeInBits(); + if ((NumBits != 8 && NumBits != 16) || + NumBits != Load->getMemoryVT().getStoreSizeInBits()) return; // The load must be an extending one and the constant must be within the @@ -2161,15 +2205,6 @@ static bool shouldSwapCmpOperands(const Comparison &C) { return false; } -// Return a version of comparison CC mask CCMask in which the LT and GT -// actions are swapped. -static unsigned reverseCCMask(unsigned CCMask) { - return ((CCMask & SystemZ::CCMASK_CMP_EQ) | - (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) | - (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) | - (CCMask & SystemZ::CCMASK_CMP_UO)); -} - // Check whether C tests for equality between X and Y and whether X - Y // or Y - X is also computed. In that case it's better to compare the // result of the subtraction against zero. 
@@ -2205,7 +2240,7 @@ static void adjustForFNeg(Comparison &C) { SDNode *N = *I; if (N->getOpcode() == ISD::FNEG) { C.Op0 = SDValue(N, 0); - C.CCMask = reverseCCMask(C.CCMask); + C.CCMask = SystemZ::reverseCCMask(C.CCMask); return; } } @@ -2572,7 +2607,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1, if (shouldSwapCmpOperands(C)) { std::swap(C.Op0, C.Op1); - C.CCMask = reverseCCMask(C.CCMask); + C.CCMask = SystemZ::reverseCCMask(C.CCMask); } adjustForTestUnderMask(DAG, DL, C); @@ -3103,7 +3138,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue *CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD); - Offset = DAG.getConstantPool(CPV, PtrVT, 8); + Offset = DAG.getConstantPool(CPV, PtrVT, Align(8)); Offset = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); @@ -3118,7 +3153,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue *CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM); - Offset = DAG.getConstantPool(CPV, PtrVT, 8); + Offset = DAG.getConstantPool(CPV, PtrVT, Align(8)); Offset = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); @@ -3136,7 +3171,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, // Add the per-symbol offset. CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF); - SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8); + SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, Align(8)); DTPOffset = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), DTPOffset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); @@ -3161,7 +3196,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue *CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); - Offset = DAG.getConstantPool(CPV, PtrVT, 8); + Offset = DAG.getConstantPool(CPV, PtrVT, Align(8)); Offset = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); @@ -3202,11 +3237,11 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue Result; if (CP->isMachineConstantPoolEntry()) - Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + Result = + DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); else - Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment(), CP->getOffset()); + Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(), + CP->getOffset()); // Use LARL to load the address of the constant pool entry. return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -3214,6 +3249,8 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + auto *TFL = + static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering()); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); @@ -3222,9 +3259,12 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); + // Return null if the back chain is not present. 
+ bool HasBackChain = MF.getFunction().hasFnAttribute("backchain"); + if (TFL->usePackedStack(MF) && !HasBackChain) + return DAG.getConstant(0, DL, PtrVT); + // By definition, the frame address is the address of the back chain. - auto *TFL = - static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering()); int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF); SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT); @@ -3355,9 +3395,9 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL), - /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false, - /*isTailCall*/false, - MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); + Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false, + /*isTailCall*/ false, MachinePointerInfo(DstSV), + MachinePointerInfo(SrcSV)); } SDValue SystemZTargetLowering:: @@ -3398,10 +3438,17 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); // Get the new stack pointer value. - SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace); - - // Copy the new stack pointer back. - Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP); + SDValue NewSP; + if (hasInlineStackProbe(MF)) { + NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL, + DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace); + Chain = NewSP.getValue(1); + } + else { + NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace); + // Copy the new stack pointer back. + Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP); + } // The allocated data lives above the 160 bytes allocated for the standard // frame, plus any outgoing stack arguments. We don't know how much that @@ -3995,7 +4042,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op, } MachineMemOperand::Flags -SystemZTargetLowering::getMMOFlags(const Instruction &I) const { +SystemZTargetLowering::getTargetMMOFlags(const Instruction &I) const { // Because of how we convert atomic_load and atomic_store to normal loads and // stores in the DAG, we need to ensure that the MMOs are marked volatile // since DAGCombine hasn't been updated to account for atomic, but non @@ -4362,7 +4409,7 @@ static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start, } // Bytes is a VPERM-like permute vector, except that -1 is used for -// undefined bytes. Return true if it can be performed using VSLDI. +// undefined bytes. Return true if it can be performed using VSLDB. // When returning true, set StartIndex to the shift amount and OpNo0 // and OpNo1 to the VPERM operands that should be used as the first // and second shift operand respectively. @@ -4420,23 +4467,86 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL, return Op; } +static bool isZeroVector(SDValue N) { + if (N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0); + if (N->getOpcode() == ISD::SPLAT_VECTOR) + if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0))) + return Op->getZExtValue() == 0; + return ISD::isBuildVectorAllZeros(N.getNode()); +} + +// Return the index of the zero/undef vector, or UINT32_MAX if not found. +static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) { + for (unsigned I = 0; I < Num ; I++) + if (isZeroVector(Ops[I])) + return I; + return UINT32_MAX; +} + // Bytes is a VPERM-like permute vector, except that -1 is used for // undefined bytes. Implement it on operands Ops[0] and Ops[1] using -// VSLDI or VPERM. 
+// VSLDB or VPERM. static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL, SDValue *Ops, const SmallVectorImpl<int> &Bytes) { for (unsigned I = 0; I < 2; ++I) Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]); - // First see whether VSLDI can be used. + // First see whether VSLDB can be used. unsigned StartIndex, OpNo0, OpNo1; if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1)) return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0], Ops[OpNo1], DAG.getTargetConstant(StartIndex, DL, MVT::i32)); - // Fall back on VPERM. Construct an SDNode for the permute vector. + // Fall back on VPERM. Construct an SDNode for the permute vector. Try to + // eliminate a zero vector by reusing any zero index in the permute vector. + unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2); + if (ZeroVecIdx != UINT32_MAX) { + bool MaskFirst = true; + int ZeroIdx = -1; + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) { + unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes; + unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes; + if (OpNo == ZeroVecIdx && I == 0) { + // If the first byte is zero, use mask as first operand. + ZeroIdx = 0; + break; + } + if (OpNo != ZeroVecIdx && Byte == 0) { + // If mask contains a zero, use it by placing that vector first. + ZeroIdx = I + SystemZ::VectorBytes; + MaskFirst = false; + break; + } + } + if (ZeroIdx != -1) { + SDValue IndexNodes[SystemZ::VectorBytes]; + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) { + if (Bytes[I] >= 0) { + unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes; + unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes; + if (OpNo == ZeroVecIdx) + IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32); + else { + unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte; + IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32); + } + } else + IndexNodes[I] = DAG.getUNDEF(MVT::i32); + } + SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes); + SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0]; + if (MaskFirst) + return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src, + Mask); + else + return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask, + Mask); + } + } + SDValue IndexNodes[SystemZ::VectorBytes]; for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) if (Bytes[I] >= 0) @@ -4444,16 +4554,20 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL, else IndexNodes[I] = DAG.getUNDEF(MVT::i32); SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes); - return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2); + return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], + (!Ops[1].isUndef() ? Ops[1] : Ops[0]), Op2); } namespace { // Describes a general N-operand vector shuffle. struct GeneralShuffle { - GeneralShuffle(EVT vt) : VT(vt) {} + GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {} void addUndef(); bool add(SDValue, unsigned); SDValue getNode(SelectionDAG &, const SDLoc &); + void tryPrepareForUnpack(); + bool unpackWasPrepared() { return UnpackFromEltSize <= 4; } + SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op); // The operands of the shuffle. SmallVector<SDValue, SystemZ::VectorBytes> Ops; @@ -4465,6 +4579,9 @@ struct GeneralShuffle { // The type of the shuffle result. EVT VT; + + // Holds a value of 1, 2 or 4 if a final unpack has been prepared for. 
+ unsigned UnpackFromEltSize; }; } @@ -4547,6 +4664,9 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) { if (Ops.size() == 0) return DAG.getUNDEF(VT); + // Use a single unpack if possible as the last operation. + tryPrepareForUnpack(); + // Make sure that there are at least two shuffle operands. if (Ops.size() == 1) Ops.push_back(DAG.getUNDEF(MVT::v16i8)); @@ -4612,13 +4732,117 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) { // to VPERM. unsigned OpNo0, OpNo1; SDValue Op; - if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1)) + if (unpackWasPrepared() && Ops[1].isUndef()) + Op = Ops[0]; + else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1)) Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]); else Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes); + + Op = insertUnpackIfPrepared(DAG, DL, Op); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); } +#ifndef NDEBUG +static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) { + dbgs() << Msg.c_str() << " { "; + for (unsigned i = 0; i < Bytes.size(); i++) + dbgs() << Bytes[i] << " "; + dbgs() << "}\n"; +} +#endif + +// If the Bytes vector matches an unpack operation, prepare to do the unpack +// after all else by removing the zero vector and the effect of the unpack on +// Bytes. +void GeneralShuffle::tryPrepareForUnpack() { + uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size()); + if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1) + return; + + // Only do this if removing the zero vector reduces the depth, otherwise + // the critical path will increase with the final unpack. + if (Ops.size() > 2 && + Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1)) + return; + + // Find an unpack that would allow removing the zero vector from Ops. + UnpackFromEltSize = 1; + for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) { + bool MatchUnpack = true; + SmallVector<int, SystemZ::VectorBytes> SrcBytes; + for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) { + unsigned ToEltSize = UnpackFromEltSize * 2; + bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize; + if (!IsZextByte) + SrcBytes.push_back(Bytes[Elt]); + if (Bytes[Elt] != -1) { + unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes; + if (IsZextByte != (OpNo == ZeroVecOpNo)) { + MatchUnpack = false; + break; + } + } + } + if (MatchUnpack) { + if (Ops.size() == 2) { + // Don't use unpack if a single source operand needs rearrangement. + for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++) + if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) { + UnpackFromEltSize = UINT_MAX; + return; + } + } + break; + } + } + if (UnpackFromEltSize > 4) + return; + + LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size " + << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo + << ".\n"; + dumpBytes(Bytes, "Original Bytes vector:");); + + // Apply the unpack in reverse to the Bytes array. 
+ unsigned B = 0; + for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) { + Elt += UnpackFromEltSize; + for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++) + Bytes[B] = Bytes[Elt]; + } + while (B < SystemZ::VectorBytes) + Bytes[B++] = -1; + + // Remove the zero vector from Ops + Ops.erase(&Ops[ZeroVecOpNo]); + for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) + if (Bytes[I] >= 0) { + unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes; + if (OpNo > ZeroVecOpNo) + Bytes[I] -= SystemZ::VectorBytes; + } + + LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:"); + dbgs() << "\n";); +} + +SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG, + const SDLoc &DL, + SDValue Op) { + if (!unpackWasPrepared()) + return Op; + unsigned InBits = UnpackFromEltSize * 8; + EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits), + SystemZ::VectorBits / InBits); + SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op); + unsigned OutBits = InBits * 2; + EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits), + SystemZ::VectorBits / OutBits); + return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp); +} + // Return true if the given BUILD_VECTOR is a scalar-to-vector conversion. static bool isScalarToVector(SDValue Op) { for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I) @@ -5013,9 +5237,8 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, DL, VT, Res); } -SDValue -SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const { +SDValue SystemZTargetLowering:: +lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const { SDValue PackedOp = Op.getOperand(0); EVT OutVT = Op.getValueType(); EVT InVT = PackedOp.getValueType(); @@ -5025,11 +5248,39 @@ SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, FromBits *= 2; EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits), SystemZ::VectorBits / FromBits); - PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp); + PackedOp = + DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp); } while (FromBits != ToBits); return PackedOp; } +// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector. +SDValue SystemZTargetLowering:: +lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const { + SDValue PackedOp = Op.getOperand(0); + SDLoc DL(Op); + EVT OutVT = Op.getValueType(); + EVT InVT = PackedOp.getValueType(); + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned OutNumElts = OutVT.getVectorNumElements(); + unsigned NumInPerOut = InNumElts / OutNumElts; + + SDValue ZeroVec = + DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType())); + + SmallVector<int, 16> Mask(InNumElts); + unsigned ZeroVecElt = InNumElts; + for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) { + unsigned MaskElt = PackedElt * NumInPerOut; + unsigned End = MaskElt + NumInPerOut - 1; + for (; MaskElt < End; MaskElt++) + Mask[MaskElt] = ZeroVecElt++; + Mask[MaskElt] = PackedElt; + } + SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask); + return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf); +} + SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const { // Look for cases where a vector shift can use the *_BY_SCALAR form. 
@@ -5195,9 +5446,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::SIGN_EXTEND_VECTOR_INREG: - return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH); + return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: - return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH); + return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG); case ISD::SHL: return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); case ISD::SRL: @@ -5315,6 +5566,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(BR_CCMASK); OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); + OPCODE(PROBED_ALLOCA); OPCODE(POPCNT); OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); @@ -6056,6 +6308,32 @@ SDValue SystemZTargetLowering::combineFP_EXTEND( return SDValue(); } +SDValue SystemZTargetLowering::combineINT_TO_FP( + SDNode *N, DAGCombinerInfo &DCI) const { + if (DCI.Level != BeforeLegalizeTypes) + return SDValue(); + unsigned Opcode = N->getOpcode(); + EVT OutVT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDValue Op = N->getOperand(0); + unsigned OutScalarBits = OutVT.getScalarSizeInBits(); + unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits(); + + // Insert an extension before type-legalization to avoid scalarization, e.g.: + // v2f64 = uint_to_fp v2i16 + // => + // v2f64 = uint_to_fp (v2i64 zero_extend v2i16) + if (OutVT.isVector() && OutScalarBits > InScalarBits) { + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()), + OutVT.getVectorNumElements()); + unsigned ExtOpcode = + (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); + SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op); + return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp); + } + return SDValue(); +} + SDValue SystemZTargetLowering::combineBSWAP( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6243,15 +6521,7 @@ static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) { return false; // Compute the effective CC mask for the new branch or select. - switch (CCMask) { - case SystemZ::CCMASK_CMP_EQ: break; - case SystemZ::CCMASK_CMP_NE: break; - case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break; - case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break; - case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break; - case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break; - default: return false; - } + CCMask = SystemZ::reverseCCMask(CCMask); // Return the updated CCReg link. CCReg = IPM->getOperand(0); @@ -6367,6 +6637,34 @@ SDValue SystemZTargetLowering::combineIntDIVREM( return SDValue(); } +SDValue SystemZTargetLowering::combineINTRINSIC( + SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (Id) { + // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15 + // or larger is simply a vector load. + case Intrinsic::s390_vll: + case Intrinsic::s390_vlrl: + if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (C->getZExtValue() >= 15) + return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0), + N->getOperand(3), MachinePointerInfo()); + break; + // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH. 
+ case Intrinsic::s390_vstl: + case Intrinsic::s390_vstrl: + if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3))) + if (C->getZExtValue() >= 15) + return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2), + N->getOperand(4), MachinePointerInfo()); + break; + } + + return SDValue(); +} + SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const { if (N->getOpcode() == SystemZISD::PCREL_WRAPPER) return N->getOperand(0); @@ -6391,6 +6689,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: return combineFP_ROUND(N, DCI); case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return combineFP_EXTEND(N, DCI); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: return combineINT_TO_FP(N, DCI); case ISD::BSWAP: return combineBSWAP(N, DCI); case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI); case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI); @@ -6399,6 +6699,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, case ISD::UDIV: case ISD::SREM: case ISD::UREM: return combineIntDIVREM(N, DCI); + case ISD::INTRINSIC_W_CHAIN: + case ISD::INTRINSIC_VOID: return combineINTRINSIC(N, DCI); } return SDValue(); @@ -6580,7 +6882,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0); Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1); if (IsLogical) { - Known = Known.zext(BitWidth, true); + Known = Known.zext(BitWidth); } else Known = Known.sext(BitWidth); break; @@ -6609,7 +6911,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // Known has the width of the source operand(s). Adjust if needed to match // the passed bitwidth. if (Known.getBitWidth() != BitWidth) - Known = Known.zextOrTrunc(BitWidth, false); + Known = Known.anyextOrTrunc(BitWidth); } static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts, @@ -6690,38 +6992,29 @@ SystemZTargetLowering::ComputeNumSignBitsForTargetNode( return 1; } +unsigned +SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >=1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // Round down to the stack alignment. + StackProbeSize &= ~(StackAlign - 1); + return StackProbeSize ? StackProbeSize : StackAlign; +} + //===----------------------------------------------------------------------===// // Custom insertion //===----------------------------------------------------------------------===// -// Create a new basic block after MBB. -static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) { - MachineFunction &MF = *MBB->getParent(); - MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock()); - MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB); - return NewMBB; -} - -// Split MBB after MI and return the new block (the one that contains -// instructions after MI). 
-static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI, - MachineBasicBlock *MBB) { - MachineBasicBlock *NewMBB = emitBlockAfter(MBB); - NewMBB->splice(NewMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - NewMBB->transferSuccessorsAndUpdatePHIs(MBB); - return NewMBB; -} - -// Split MBB before MI and return the new block (the one that contains MI). -static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI, - MachineBasicBlock *MBB) { - MachineBasicBlock *NewMBB = emitBlockAfter(MBB); - NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end()); - NewMBB->transferSuccessorsAndUpdatePHIs(MBB); - return NewMBB; -} - // Force base value Base into a register before MI. Return the register. static Register forceReg(MachineInstr &MI, MachineOperand &Base, const SystemZInstrInfo *TII) { @@ -6859,8 +7152,6 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, for (MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); NextMIIt != MBB->end(); ++NextMIIt) { - if (NextMIIt->definesRegister(SystemZ::CC)) - break; if (isSelectPseudo(*NextMIIt)) { assert(NextMIIt->getOperand(3).getImm() == CCValid && "Bad CCValid operands since CC was not redefined."); @@ -6871,6 +7162,9 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, } break; } + if (NextMIIt->definesRegister(SystemZ::CC) || + NextMIIt->usesCustomInsertionHook()) + break; bool User = false; for (auto SelMI : Selects) if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) { @@ -6891,8 +7185,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, bool CCKilled = (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB)); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB); - MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *JoinMBB = SystemZ::splitBlockAfter(LastMI, MBB); + MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB); // Unless CC was killed in the last Select instruction, mark it as // live-in to both FalseMBB and JoinMBB. @@ -6985,8 +7279,8 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, CCMask ^= CCValid; MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *JoinMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB); // Unless CC was killed in the CondStore instruction, mark it as // live-in to both FalseMBB and JoinMBB. @@ -7069,8 +7363,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( // Insert a basic block for the main loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); // StartMBB: // ... @@ -7187,10 +7481,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( // Insert 3 basic blocks for the loop. 
MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB); - MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *UseAltMBB = SystemZ::emitBlockAfter(LoopMBB); + MachineBasicBlock *UpdateMBB = SystemZ::emitBlockAfter(UseAltMBB); // StartMBB: // ... @@ -7298,9 +7592,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, // Insert 2 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *SetMBB = SystemZ::emitBlockAfter(LoopMBB); // StartMBB: // ... @@ -7460,7 +7754,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? - splitBlockAfter(MI, MBB) : nullptr); + SystemZ::splitBlockAfter(MI, MBB) : nullptr); // Check for the loop form, in which operand 5 is the trip count. if (MI.getNumExplicitOperands() > 5) { @@ -7484,9 +7778,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( Register NextCountReg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); - MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *NextMBB = + (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); // StartMBB: // # fall through to LoopMMB @@ -7602,7 +7897,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // If there's another CLC to go, branch to the end if a difference // was found. 
if (EndMBB && Length > 0) { - MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB); + MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) .addMBB(EndMBB); @@ -7642,8 +7937,8 @@ MachineBasicBlock *SystemZTargetLowering::emitStringWrapper( uint64_t End2Reg = MRI.createVirtualRegister(RC); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB); + MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); // StartMBB: // # fall through to LoopMMB @@ -7754,6 +8049,97 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0( return MBB; } +MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca( + MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + DebugLoc DL = MI.getDebugLoc(); + const unsigned ProbeSize = getStackProbeSize(MF); + Register DstReg = MI.getOperand(0).getReg(); + Register SizeReg = MI.getOperand(2).getReg(); + + MachineBasicBlock *StartMBB = MBB; + MachineBasicBlock *DoneMBB = SystemZ::splitBlockAfter(MI, MBB); + MachineBasicBlock *LoopTestMBB = SystemZ::emitBlockAfter(StartMBB); + MachineBasicBlock *LoopBodyMBB = SystemZ::emitBlockAfter(LoopTestMBB); + MachineBasicBlock *TailTestMBB = SystemZ::emitBlockAfter(LoopBodyMBB); + MachineBasicBlock *TailMBB = SystemZ::emitBlockAfter(TailTestMBB); + + MachineMemOperand *VolLdMMO = MF.getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1)); + + Register PHIReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register IncReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass); + + // LoopTestMBB + // BRC TailTestMBB + // # fallthrough to LoopBodyMBB + StartMBB->addSuccessor(LoopTestMBB); + MBB = LoopTestMBB; + BuildMI(MBB, DL, TII->get(SystemZ::PHI), PHIReg) + .addReg(SizeReg) + .addMBB(StartMBB) + .addReg(IncReg) + .addMBB(LoopBodyMBB); + BuildMI(MBB, DL, TII->get(SystemZ::CLGFI)) + .addReg(PHIReg) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT) + .addMBB(TailTestMBB); + MBB->addSuccessor(LoopBodyMBB); + MBB->addSuccessor(TailTestMBB); + + // LoopBodyMBB: Allocate and probe by means of a volatile compare. 
+ // J LoopTestMBB + MBB = LoopBodyMBB; + BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), IncReg) + .addReg(PHIReg) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D) + .addReg(SystemZ::R15D) + .addImm(ProbeSize); + BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D) + .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0) + .setMemRefs(VolLdMMO); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(LoopTestMBB); + MBB->addSuccessor(LoopTestMBB); + + // TailTestMBB + // BRC DoneMBB + // # fallthrough to TailMBB + MBB = TailTestMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(PHIReg) + .addImm(0); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(DoneMBB); + MBB->addSuccessor(TailMBB); + MBB->addSuccessor(DoneMBB); + + // TailMBB + // # fallthrough to DoneMBB + MBB = TailMBB; + BuildMI(MBB, DL, TII->get(SystemZ::SLGR), SystemZ::R15D) + .addReg(SystemZ::R15D) + .addReg(PHIReg); + BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D) + .addReg(SystemZ::R15D).addImm(-8).addReg(PHIReg) + .setMemRefs(VolLdMMO); + MBB->addSuccessor(DoneMBB); + + // DoneMBB + MBB = DoneMBB; + BuildMI(*MBB, MBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg) + .addReg(SystemZ::R15D); + + MI.eraseFromParent(); + return DoneMBB; +} + MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { switch (MI.getOpcode()) { @@ -8014,6 +8400,9 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::LTXBRCompare_VecPseudo: return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR); + case SystemZ::PROBED_ALLOCA: + return emitProbedAlloca(MI, MBB); + case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, MBB); |
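
Note on the new stack-probing hooks introduced above: inline probing is requested per function through the "probe-stack"="inline-asm" attribute (checked in hasInlineStackProbe), and the probe granularity comes from getStackProbeSize(), which takes the optional "stack-probe-size" attribute (default 4096) and rounds it down to the stack alignment, falling back to the alignment itself if the rounded value would be zero. Below is a minimal standalone C++ sketch of just that rounding step, assuming a power-of-two stack alignment (8 bytes on SystemZ); the helper name and the sample inputs are illustrative only, not taken from the patch.

#include <cassert>
#include <cstdint>
#include <iostream>

// Mirrors the rounding performed in SystemZTargetLowering::getStackProbeSize():
// round the requested probe size down to the stack alignment, and fall back to
// the alignment itself if the result would be zero.
static uint32_t probeSize(uint32_t Requested, uint32_t StackAlign) {
  assert(StackAlign >= 1 && (StackAlign & (StackAlign - 1)) == 0 &&
         "stack alignment must be a power of two");
  uint32_t Rounded = Requested & ~(StackAlign - 1);
  return Rounded ? Rounded : StackAlign;
}

int main() {
  std::cout << probeSize(4096, 8) << '\n'; // default attribute value -> 4096
  std::cout << probeSize(100, 8) << '\n';  // rounds down to 96
  std::cout << probeSize(4, 8) << '\n';    // would round to 0 -> falls back to 8
}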