| Field | Value | Date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2015-08-07 23:01:33 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2015-08-07 23:01:33 +0000 |
| commit | ee8648bdac07986a0f1ec897b02ec82a2f144d46 | |
| tree | 52d1861acda1205241ee35a94aa63129c604d469 /lib/Target/AMDGPU | |
| parent | 1a82d4c088707c791c792f6822f611b47a12bdfe | |
Diffstat (limited to 'lib/Target/AMDGPU')
24 files changed, 408 insertions(+), 150 deletions(-)
| diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 569ad3844b25..ef8ef6268548 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -98,6 +98,16 @@ def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",          "true",          "Enable SI load/store optimizer pass">; +// Performance debugging feature. Allow using DS instruction immediate +// offsets even if the base pointer can't be proven to be base. On SI, +// base pointer values that won't give the same result as a 16-bit add +// are not safe to fold, but this will override the conservative test +// for the base pointer. +def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding", +        "EnableUnsafeDSOffsetFolding", +        "true", +        "Force using DS instruction immediate offsets on SI">; +  def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",          "FlatAddressSpace",          "true", diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 0b426bc63dd5..ad267d350850 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -22,7 +22,6 @@ using namespace llvm;  namespace {  class AMDGPUAlwaysInline : public ModulePass { -    static char ID;  public: @@ -36,10 +35,9 @@ public:  char AMDGPUAlwaysInline::ID = 0;  bool AMDGPUAlwaysInline::runOnModule(Module &M) { +  std::vector<Function *> FuncsToClone; -  std::vector<Function*> FuncsToClone; -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    Function &F = *I; +  for (Function &F : M) {      if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&          !F.hasFnAttribute(Attribute::NoInline))        FuncsToClone.push_back(&F); @@ -49,12 +47,11 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {      ValueToValueMapTy VMap;      Function *NewFunc = CloneFunction(F, VMap, false);      NewFunc->setLinkage(GlobalValue::InternalLinkage); -    F->getParent()->getFunctionList().push_back(NewFunc); +    M.getFunctionList().push_back(NewFunc);      F->replaceAllUsesWith(NewFunc);    } -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    Function &F = *I; +  for (Function &F : M) {      if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {        F.addFnAttr(Attribute::AlwaysInline);      } diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index df4461eac4db..37b77d778d9f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -110,8 +110,11 @@ private:                           SDValue &Offset, SDValue &GLC) const;    SDNode *SelectAddrSpaceCast(SDNode *N);    bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; +  bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;    bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,                         SDValue &Clamp, SDValue &Omod) const; +  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, +                         SDValue &Clamp, SDValue &Omod) const;    bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,                              SDValue &Omod) const; @@ -859,7 +862,8 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,        (OffsetBits == 8 && !isUInt<8>(Offset)))      return false; -  if (Subtarget->getGeneration() >= 
AMDGPUSubtarget::SEA_ISLANDS) +  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS || +      Subtarget->unsafeDSOffsetFoldingEnabled())      return true;    // On Southern Islands instruction with a negative base value and an offset @@ -1316,6 +1320,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,    return true;  } +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, +                                         SDValue &SrcMods) const { +  bool Res = SelectVOP3Mods(In, Src, SrcMods); +  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +} +  bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,                                           SDValue &SrcMods, SDValue &Clamp,                                           SDValue &Omod) const { @@ -1327,6 +1337,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,    return SelectVOP3Mods(In, Src, SrcMods);  } +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, +                                           SDValue &SrcMods, SDValue &Clamp, +                                           SDValue &Omod) const { +  bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); + +  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && +                cast<ConstantSDNode>(Clamp)->isNullValue() && +                cast<ConstantSDNode>(Omod)->isNullValue(); +} +  bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,                                                SDValue &SrcMods,                                                SDValue &Omod) const { @@ -1351,18 +1371,14 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {    do {      IsModified = false;      // Go over all selected nodes and try to fold them a bit more -    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), -         E = CurDAG->allnodes_end(); I != E; ++I) { - -      SDNode *Node = I; - -      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I); +    for (SDNode &Node : CurDAG->allnodes()) { +      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);        if (!MachineNode)          continue;        SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); -      if (ResNode != Node) { -        ReplaceUses(Node, ResNode); +      if (ResNode != &Node) { +        ReplaceUses(&Node, ResNode);          IsModified = true;        }      } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d56838ec2019..3a65f3b56146 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,    setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);    setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); +  setTargetDAGCombine(ISD::SHL);    setTargetDAGCombine(ISD::MUL);    setTargetDAGCombine(ISD::SELECT);    setTargetDAGCombine(ISD::SELECT_CC); @@ -444,7 +445,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,  // Target Information  //===----------------------------------------------------------------------===// -MVT AMDGPUTargetLowering::getVectorIdxTy() const { +MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {    return MVT::i32;  } @@ -545,9 +546,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {  }  bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { -  const DataLayout *DL = getDataLayout(); - 
 unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); -  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); +  unsigned SrcSize = Src->getScalarSizeInBits(); +  unsigned DestSize = Dest->getScalarSizeInBits();    return SrcSize == 32 && DestSize == 64;  } @@ -697,7 +697,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,                                                         const SDValue &InitPtr,                                                         SDValue Chain,                                                         SelectionDAG &DAG) const { -  const DataLayout *TD = getDataLayout(); +  const DataLayout &TD = DAG.getDataLayout();    SDLoc DL(InitPtr);    Type *InitTy = Init->getType(); @@ -705,20 +705,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,      EVT VT = EVT::getEVT(InitTy);      PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);      return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, -                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false, -                        TD->getPrefTypeAlignment(InitTy)); +                        MachinePointerInfo(UndefValue::get(PtrTy)), false, +                        false, TD.getPrefTypeAlignment(InitTy));    }    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {      EVT VT = EVT::getEVT(CFP->getType());      PointerType *PtrTy = PointerType::get(CFP->getType(), 0);      return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, -                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, -                 TD->getPrefTypeAlignment(CFP->getType())); +                        MachinePointerInfo(UndefValue::get(PtrTy)), false, +                        false, TD.getPrefTypeAlignment(CFP->getType()));    }    if (StructType *ST = dyn_cast<StructType>(InitTy)) { -    const StructLayout *SL = TD->getStructLayout(ST); +    const StructLayout *SL = TD.getStructLayout(ST);      EVT PtrVT = InitPtr.getValueType();      SmallVector<SDValue, 8> Chains; @@ -745,7 +745,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,      else        llvm_unreachable("Unexpected type"); -    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); +    unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());      SmallVector<SDValue, 8> Chains;      for (unsigned i = 0; i < NumElements; ++i) {        SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); @@ -762,8 +762,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,      EVT VT = EVT::getEVT(InitTy);      PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);      return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, -                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false, -                        TD->getPrefTypeAlignment(InitTy)); +                        MachinePointerInfo(UndefValue::get(PtrTy)), false, +                        false, TD.getPrefTypeAlignment(InitTy));    }    Init->dump(); @@ -785,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,                                                   SDValue Op,                                                   SelectionDAG &DAG) const { -  const DataLayout *TD = getDataLayout(); +  const DataLayout &DL = DAG.getDataLayout();    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);    const GlobalValue *GV = 
G->getGlobal(); @@ -801,7 +801,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,      unsigned Offset;      if (MFI->LocalMemoryObjects.count(GV) == 0) { -      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); +      uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());        Offset = MFI->LDSSize;        MFI->LocalMemoryObjects[GV] = Offset;        // XXX: Account for alignment? @@ -811,16 +811,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,      }      return DAG.getConstant(Offset, SDLoc(Op), -                           getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); +                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));    }    case AMDGPUAS::CONSTANT_ADDRESS: {      MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();      Type *EltType = GV->getType()->getElementType(); -    unsigned Size = TD->getTypeAllocSize(EltType); -    unsigned Alignment = TD->getPrefTypeAlignment(EltType); +    unsigned Size = DL.getTypeAllocSize(EltType); +    unsigned Alignment = DL.getPrefTypeAlignment(EltType); -    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); -    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); +    MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS); +    MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);      int FI = FrameInfo->CreateStackObject(Size, Alignment, false);      SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); @@ -1653,7 +1653,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool    // fb = fabs(fb);    fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); +  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);    // int cv = fr >= fb;    SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); @@ -1960,7 +1960,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {    const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);    SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);    SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); @@ -2020,7 +2021,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {    SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);    SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);    const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); @@ -2051,7 +2053,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {    APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");    SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);    SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);    return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); @@ -2081,7 +2084,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const    SDValue SignOne = 
DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);    SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); @@ -2100,8 +2104,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const    const SDValue One = DAG.getConstant(1, SL, MVT::i32);    const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);    const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); @@ -2172,7 +2176,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {    const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);    const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);    SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);    SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); @@ -2411,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,                        SN->getBasePtr(), SN->getMemOperand());  } +SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, +                                                DAGCombinerInfo &DCI) const { +  if (N->getValueType(0) != MVT::i64) +    return SDValue(); + +  // i64 (shl x, 32) -> (build_pair 0, x) + +  // Doing this with moves theoretically helps MI optimizations that understand +  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as +  // v_lshl_b64. In the SALU case, I think this is slightly worse since it +  // doubles the code size and I'm unsure about cycle count. +  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); +  if (!RHS || RHS->getZExtValue() != 32) +    return SDValue(); + +  SDValue LHS = N->getOperand(0); + +  SDLoc SL(N); +  SelectionDAG &DAG = DCI.DAG; + +  // Extract low 32-bits. 
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + +  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); +  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo); +} +  SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,                                                  DAGCombinerInfo &DCI) const {    EVT VT = N->getValueType(0); @@ -2448,17 +2480,24 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,    SDLoc DL(N);    switch(N->getOpcode()) { -    default: break; -    case ISD::MUL: -      return performMulCombine(N, DCI); -    case AMDGPUISD::MUL_I24: -    case AMDGPUISD::MUL_U24: { -      SDValue N0 = N->getOperand(0); -      SDValue N1 = N->getOperand(1); -      simplifyI24(N0, DCI); -      simplifyI24(N1, DCI); -      return SDValue(); -    } +  default: +    break; +  case ISD::SHL: { +    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) +      break; + +    return performShlCombine(N, DCI); +  } +  case ISD::MUL: +    return performMulCombine(N, DCI); +  case AMDGPUISD::MUL_I24: +  case AMDGPUISD::MUL_U24: { +    SDValue N0 = N->getOperand(0); +    SDValue N1 = N->getOperand(1); +    simplifyI24(N0, DCI); +    simplifyI24(N1, DCI); +    return SDValue(); +  }    case ISD::SELECT: {      SDValue Cond = N->getOperand(0);      if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { @@ -2644,6 +2683,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,    return DAG.getRegister(VirtualRegister, VT);  } +uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( +    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { +  uint64_t ArgOffset = MFI->ABIArgOffset; +  switch (Param) { +  case GRID_DIM: +    return ArgOffset; +  case GRID_OFFSET: +    return ArgOffset + 4; +  } +  llvm_unreachable("unexpected implicit parameter type"); +} +  #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;  const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fbb7d3c88437..478b2035fd75 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -65,6 +65,7 @@ private:    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;    SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; +  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;  protected: @@ -123,7 +124,7 @@ public:    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; -  MVT getVectorIdxTy() const override; +  MVT getVectorIdxTy(const DataLayout &) const override;    bool isSelectSupported(SelectSupportKind) const override;    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; @@ -207,6 +208,16 @@ public:    virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,                                         const TargetRegisterClass *RC,                                         unsigned Reg, EVT VT) const; + +  enum ImplicitParameter { +    GRID_DIM, +    GRID_OFFSET +  }; + +  /// \brief Helper function that returns the byte offset of the given +  /// type of implicit parameter. 
+  unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, +                                      const ImplicitParameter Param) const;  };  namespace AMDGPUISD { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0779d1d786b2..bd5abc4f546e 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -69,6 +69,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,        FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),        CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),        EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), +      EnableUnsafeDSOffsetFolding(false),        WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),        EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),        GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 30f50eb1d2f3..90831bfb4458 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -76,6 +76,7 @@ private:    bool EnablePromoteAlloca;    bool EnableIfCvt;    bool EnableLoadStoreOpt; +  bool EnableUnsafeDSOffsetFolding;    unsigned WavefrontSize;    bool CFALUBug;    int LocalMemorySize; @@ -222,6 +223,10 @@ public:      return EnableLoadStoreOpt;    } +  bool unsafeDSOffsetFoldingEnabled() const { +    return EnableUnsafeDSOffsetFolding; +  } +    unsigned getWavefrontSize() const {      return WavefrontSize;    } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a9a911a8efed..2297b52b423c 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -156,8 +156,10 @@ public:  } // End of anonymous namespace  TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis( -      [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); +  return TargetIRAnalysis([this](Function &F) { +    return TargetTransformInfo( +        AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); +  });  }  void AMDGPUPassConfig::addIRPasses() { @@ -269,6 +271,7 @@ void GCNPassConfig::addPreRegAlloc() {      // also need extra copies to the address operand to be eliminated.      
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());      insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); +    insertPass(&MachineSchedulerID, &RegisterCoalescerID);    }    addPass(createSIShrinkInstructionsPass(), false);    addPass(createSIFixSGPRLiveRangesPass(), false); @@ -280,10 +283,10 @@ void GCNPassConfig::addPostRegAlloc() {  }  void GCNPassConfig::addPreSched2() { -  addPass(createSIInsertWaits(*TM), false);  }  void GCNPassConfig::addPreEmitPass() { +  addPass(createSIInsertWaits(*TM), false);    addPass(createSILowerControlFlowPass(*TM), false);  } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 791c84e6f28b..dee0a69d1e68 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -37,8 +37,9 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {    const AMDGPUTargetLowering *getTLI() const { return TLI; }  public: -  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) -      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} +  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL) +      : BaseT(TM, DL), ST(TM->getSubtargetImpl()), +        TLI(ST->getTargetLowering()) {}    // Provide value semantics. MSVC requires that we spell all of these out.    AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) @@ -46,18 +47,6 @@ public:    AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)        : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),          TLI(std::move(Arg.TLI)) {} -  AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { -    BaseT::operator=(static_cast<const BaseT &>(RHS)); -    ST = RHS.ST; -    TLI = RHS.TLI; -    return *this; -  } -  AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { -    BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); -    ST = std::move(RHS.ST); -    TLI = std::move(RHS.TLI); -    return *this; -  }    bool hasBranchDivergence() { return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 7172e4bb9335..c709741f3777 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -44,7 +44,7 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {    return X;  } -static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {    MCRegisterInfo *X = new MCRegisterInfo();    InitAMDGPUMCRegisterInfo(X, 0);    return X; @@ -52,14 +52,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {  static MCSubtargetInfo *  createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { -  MCSubtargetInfo * X = new MCSubtargetInfo(); -  InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); -  return X; +  return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);  } -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, -                                               CodeModel::Model CM, -                                               CodeGenOpt::Level OL) { +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT, +                                                Reloc::Model RM, +                                                CodeModel::Model CM, +                                                CodeGenOpt::Level OL) {    MCCodeGenInfo *X = new MCCodeGenInfo();    X->initMCCodeGenInfo(RM, CM, OL);    
return X; diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 8357b6d9d0ed..4e4d554f0ee7 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -815,8 +815,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const      case Intrinsic::r600_read_local_size_z:        return LowerImplicitParameter(DAG, VT, DL, 8); -    case Intrinsic::AMDGPU_read_workdim: -      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); +    case Intrinsic::AMDGPU_read_workdim: { +      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); +      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); +    }      case Intrinsic::r600_read_tgid_x:        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, @@ -897,8 +899,9 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,    for (unsigned i = 0, e = VecVT.getVectorNumElements();                                                             i != e; ++i) { -    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, -                               DAG.getConstant(i, DL, getVectorIdxTy()))); +    Args.push_back(DAG.getNode( +        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, +        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));    }    return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); @@ -1459,22 +1462,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const    SDValue Ptr = Op.getOperand(1);    SDValue LoweredLoad; -  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); -  if (Ret.getNode()) { -    SDValue Ops[2] = { -      Ret, -      Chain -    }; -    return DAG.getMergeValues(Ops, DL); -  } +  if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) +    return Ret;    // Lower loads constant address space global variable loads    if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&        isa<GlobalVariable>(GetUnderlyingObject( -          LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { +          LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { -    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, -        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); +    SDValue Ptr = DAG.getZExtOrTrunc( +        LoadNode->getBasePtr(), DL, +        getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));      Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,          DAG.getConstant(2, DL, MVT::i32));      return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), @@ -1702,7 +1700,8 @@ SDValue R600TargetLowering::LowerFormalArguments(    return Chain;  } -EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, +                                           EVT VT) const {     if (!VT.isVector())       return MVT::i32;     return VT.changeVectorElementTypeToInteger(); diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index c06d3c4fd309..4dbac97af2a1 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -38,7 +38,9 @@ public:                                const SmallVectorImpl<ISD::InputArg> &Ins,                                SDLoc DL, SelectionDAG &DAG,                                SmallVectorImpl<SDValue> &InVals) const override; -  EVT getSetCCResultType(LLVMContext &, EVT VT) 
const override; +  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, +                         EVT VT) const override; +  private:    unsigned Gen;    /// Each OpenCL kernel has nine implicit parameters that are stored in the diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index d14e37a64612..c2887255cc11 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -126,11 +126,42 @@ static bool updateOperand(FoldCandidate &Fold,    return false;  } +static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +                              const MachineInstr *MI) { +  for (auto Candidate : FoldList) { +    if (Candidate.UseMI == MI) +      return true; +  } +  return false; +} +  static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,                               MachineInstr *MI, unsigned OpNo,                               MachineOperand *OpToFold,                               const SIInstrInfo *TII) {    if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + +    // Special case for v_mac_f32_e64 if we are trying to fold into src2 +    unsigned Opc = MI->getOpcode(); +    if (Opc == AMDGPU::V_MAC_F32_e64 && +        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { +      // Check if changing this to a v_mad_f32 instruction will allow us to +      // fold the operand. +      MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); +      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); +      if (FoldAsMAD) { +        MI->untieRegOperand(OpNo); +        return true; +      } +      MI->setDesc(TII->get(Opc)); +    } + +    // If we are already folding into another operand of MI, then +    // we can't commute the instruction, otherwise we risk making the +    // other fold illegal. +    if (isUseMIInFoldList(FoldList, MI)) +      return false; +      // Operand is not legal, so try to commute the instruction to      // see if this makes it possible to fold.      unsigned CommuteIdx0; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index ead1a3743473..dd818a9ba746 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -254,8 +254,9 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,    return false;  } -bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, -                                             Type *Ty, unsigned AS) const { +bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, +                                             const AddrMode &AM, Type *Ty, +                                             unsigned AS) const {    // No global is ever allowed as a base.    
if (AM.BaseGV)      return false; @@ -416,7 +417,7 @@ static EVT toIntegerVT(EVT VT) {  SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,                                           SDLoc SL, SDValue Chain,                                           unsigned Offset, bool Signed) const { -  const DataLayout *DL = getDataLayout(); +  const DataLayout &DL = DAG.getDataLayout();    MachineFunction &MF = DAG.getMachineFunction();    const SIRegisterInfo *TRI =        static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); @@ -425,16 +426,16 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,    Type *Ty = VT.getTypeForEVT(*DAG.getContext());    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); -  MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); +  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);    PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);    SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,                                         MRI.getLiveInVirtReg(InputPtrReg), PtrVT);    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,                              DAG.getConstant(Offset, SL, PtrVT)); -  SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); +  SDValue PtrOffset = DAG.getUNDEF(PtrVT);    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); -  unsigned Align = DL->getABITypeAlignment(Ty); +  unsigned Align = DL.getABITypeAlignment(Ty);    if (VT != MemVT && VT.isFloatingPoint()) {      // Do an integer load and convert. @@ -451,7 +452,12 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,                                 true, // isNonTemporal                                 true, // isInvariant                                 Align); // Alignment -    return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); +    SDValue Ops[] = { +      DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), +      Load.getValue(1) +    }; + +    return DAG.getMergeValues(Ops, SL);    }    ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; @@ -569,6 +575,8 @@ SDValue SITargetLowering::LowerFormalArguments(    AnalyzeFormalArguments(CCInfo, Splits); +  SmallVector<SDValue, 16> Chains; +    for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {      const ISD::InputArg &Arg = Ins[i]; @@ -587,8 +595,9 @@ SDValue SITargetLowering::LowerFormalArguments(                                VA.getLocMemOffset();        // The first 36 bytes of the input buffer contains information about        // thread group and global sizes. 
-      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(), +      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,                                     Offset, Ins[i].Flags.isSExt()); +      Chains.push_back(Arg.getValue(1));        const PointerType *ParamTy =          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -614,7 +623,8 @@ SDValue SITargetLowering::LowerFormalArguments(        Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,                                       &AMDGPU::SReg_64RegClass);        Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); -      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); +      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); +      InVals.push_back(Copy);        continue;      } @@ -634,7 +644,9 @@ SDValue SITargetLowering::LowerFormalArguments(        for (unsigned j = 1; j != NumElements; ++j) {          Reg = ArgLocs[ArgIdx++].getLocReg();          Reg = MF.addLiveIn(Reg, RC); -        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + +        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); +        Regs.push_back(Copy);        }        // Fill up the missing vector elements @@ -653,7 +665,11 @@ SDValue SITargetLowering::LowerFormalArguments(          AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));      Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);    } -  return Chain; + +  if (Chains.empty()) +    return Chain; + +  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);  }  MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( @@ -695,14 +711,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {    return true;  } -EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { +EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, +                                         EVT VT) const {    if (!VT.isVector()) {      return MVT::i1;    }    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());  } -MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { +MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {    return MVT::i32;  } @@ -888,7 +905,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,    SDLoc DL(GSD);    const GlobalValue *GV = GSD->getGlobal(); -  MVT PtrVT = getPointerTy(GSD->getAddressSpace()); +  MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());    SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); @@ -926,6 +943,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,  SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,                                                    SelectionDAG &DAG) const {    MachineFunction &MF = DAG.getMachineFunction(); +  auto MFI = MF.getInfo<SIMachineFunctionInfo>();    const SIRegisterInfo *TRI =        static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); @@ -964,8 +982,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,    case Intrinsic::AMDGPU_read_workdim:      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), -                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset, -                          false); +                          getImplicitParameterOffset(MFI, GRID_DIM), false);    case Intrinsic::r600_read_tgid_x:      return 
CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, @@ -1213,7 +1230,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); -  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); +  EVT SetCCVT = +      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); @@ -1411,7 +1429,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,      unsigned AS = Load->getAddressSpace();      unsigned Align = Load->getAlignment();      Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); -    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); +    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);      // Don't try to replace the load if we have to expand it due to alignment      // problems. Otherwise we will end up scalarizing the load, and trying to @@ -2212,9 +2230,8 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,  std::pair<unsigned, const TargetRegisterClass *>  SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, -                                               const std::string &Constraint_, +                                               StringRef Constraint,                                                 MVT VT) const { -  StringRef Constraint(Constraint_);    if (Constraint == "r") {      switch(VT.SimpleTy) {        default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index a956b013bdb1..635b4edc89de 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -62,8 +62,8 @@ public:    bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,                            EVT /*VT*/) const override; -  bool isLegalAddressingMode(const AddrMode &AM, -                             Type *Ty, unsigned AS) const override; +  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, +                             unsigned AS) const override;    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,                                        unsigned Align, @@ -90,8 +90,9 @@ public:    MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,                                        MachineBasicBlock * BB) const override;    bool enableAggressiveFMAFusion(EVT VT) const override; -  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; -  MVT getScalarShiftAmountTy(EVT VT) const override; +  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, +                         EVT VT) const override; +  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; @@ -114,9 +115,9 @@ public:                                    SDLoc DL,                                    SDValue Ptr) const; -  std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint( -                                   const TargetRegisterInfo *TRI, -                                   const std::string &Constraint, MVT VT) const override; +  std::pair<unsigned, const TargetRegisterClass *> +  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, +       
                        StringRef Constraint, MVT VT) const override;    SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;  }; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index eb96bd0227b2..18910615bebe 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -227,9 +227,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,      uint8_t Offset0 = Offset0Imm->getImm();      uint8_t Offset1 = Offset1Imm->getImm(); -    assert(Offset1 > Offset0); -    if (Offset1 - Offset0 == 1) { +    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {        // Each of these offsets is in element sized units, so we need to convert        // to bytes of the individual reads. @@ -924,7 +923,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,      return false;    unsigned Opc = UseMI->getOpcode(); -  if (Opc == AMDGPU::V_MAD_F32) { +  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {      // Don't fold if we are using source modifiers. The new VOP2 instructions      // don't have them.      if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || @@ -963,9 +962,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,        // instead of having to modify in place.        // Remove these first since they are at the end. -      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,                                                        AMDGPU::OpName::omod)); -      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,                                                        AMDGPU::OpName::clamp));        unsigned Src1Reg = Src1->getReg(); @@ -980,6 +979,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,        Src1->setSubReg(Src2SubReg);        Src1->setIsKill(Src2->isKill()); +      if (Opc == AMDGPU::V_MAC_F32_e64) { +        UseMI->untieRegOperand( +          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); +      } + +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, +                                                      AMDGPU::OpName::src2)); +      // ChangingToImmediate adds Src2 back to the instruction.        Src2->ChangeToImmediate(Imm);        removeModOperands(*UseMI); @@ -1010,11 +1017,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,        // instead of having to modify in place.        // Remove these first since they are at the end. -      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,                                                        AMDGPU::OpName::omod)); -      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,                                                        AMDGPU::OpName::clamp)); +      if (Opc == AMDGPU::V_MAC_F32_e64) { +        UseMI->untieRegOperand( +          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); +      } + +      // ChangingToImmediate adds Src2 back to the instruction.        Src2->ChangeToImmediate(Imm);        // These come before src2. 
@@ -1126,6 +1139,38 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,    return false;  } +MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, +                                                MachineBasicBlock::iterator &MI, +                                                LiveVariables *LV) const { + +  switch (MI->getOpcode()) { +    default: return nullptr; +    case AMDGPU::V_MAC_F32_e64: break; +    case AMDGPU::V_MAC_F32_e32: { +      const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); +      if (Src0->isImm() && !isInlineConstant(*Src0, 4)) +        return nullptr; +      break; +    } +  } + +  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst); +  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); +  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); +  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); + +  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) +                 .addOperand(*Dst) +                 .addImm(0) // Src0 mods +                 .addOperand(*Src0) +                 .addImm(0) // Src1 mods +                 .addOperand(*Src1) +                 .addImm(0) // Src mods +                 .addOperand(*Src2) +                 .addImm(0)  // clamp +                 .addImm(0); // omod +} +  bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {    int64_t SVal = Imm.getSExtValue();    if (SVal >= -16 && SVal <= 64) @@ -1625,7 +1670,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,    if (MO->isReg()) {      assert(DefinedRC); -    const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); +    const TargetRegisterClass *RC = +        TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? +            MRI.getRegClass(MO->getReg()) : +            RI.getPhysRegClass(MO->getReg());      // In order to be legal, the common sub-class must be equal to the      // class of the current operand.  
For example: diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 0382272068d2..015ea12d4598 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -144,6 +144,10 @@ public:    unsigned getMachineCSELookAheadLimit() const override { return 500; } +  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB, +                                      MachineBasicBlock::iterator &MI, +                                      LiveVariables *LV) const override; +    bool isSALU(uint16_t Opcode) const {      return get(Opcode).TSFlags & SIInstrFlags::SALU;    } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index fcb58d5da3b0..b39a78714640 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -529,9 +529,11 @@ def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;  def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;  def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;  def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;  def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;  def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">; +def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;  //===----------------------------------------------------------------------===//  // SI assembler operands @@ -1113,6 +1115,13 @@ def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {    field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);    field string Asm = "$dst, $src0, $vsrc1, $src2";  } +def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { +  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); +  let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, +                             HasModifiers>.ret; +  let Asm32 = getAsm32<2>.ret; +  let Asm64 = getAsm64<2, HasModifiers>.ret; +}  def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;  def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;  def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 8c8d836776db..1ee63c675822 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1488,7 +1488,10 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;  defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;  defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; -defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; +let Constraints = "$dst = $src2", DisableEncoding="$src2", +    isConvertibleToThreeAddress = 1 in { +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>; +}  } // End isCommutable = 1  defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; @@ -2206,6 +2209,15 @@ def : Pat <    (V_CNDMASK_B32_e64 $src2, $src1, $src0)  >; +// Pattern for V_MAC_F32 +def : Pat < +  (fmad  (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), +         (VOP3NoMods f32:$src1, i32:$src1_modifiers), +         (VOP3NoMods f32:$src2, i32:$src2_modifiers)), +  (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, +                 $src2_modifiers, $src2, $clamp, $omod) +>; +  /********** ======================= **********/  /********** Image sampling patterns 
**********/  /********** ======================= **********/ diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 9b1d256dc5a8..1bdb1f0ee9f9 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -214,12 +214,11 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    // cases, like vectors of pointers.    const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); -  unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); -  unsigned DestReg1 -    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); +  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst); +  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);    unsigned Offset0 -          = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; +    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;    unsigned Offset1      = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; @@ -258,20 +257,43 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;    unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; -  updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); -  updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); -  LIS->RemoveMachineInstrFromMaps(I); -  // Replacing Paired in the maps with Read2 allows us to avoid updating the -  // live range for the m0 register. -  LIS->ReplaceMachineInstrInMaps(Paired, Read2); +  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + +  // Copy to the old destination registers. +  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc) +    .addOperand(*Dest0) // Copy to same destination including flags and sub reg. +    .addReg(DestReg, 0, SubRegIdx0); +  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc) +    .addOperand(*Dest1) +    .addReg(DestReg, RegState::Kill, SubRegIdx1); + +  LIS->InsertMachineInstrInMaps(Read2); + +  // repairLiveintervalsInRange() doesn't handle physical register, so we have +  // to update the M0 range manually. +  SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); +  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); +  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); +  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + +  // The new write to the original destination register is now the copy. Steal +  // the old SlotIndex. 
+  LIS->ReplaceMachineInstrInMaps(I, Copy0); +  LIS->ReplaceMachineInstrInMaps(Paired, Copy1); +    I->eraseFromParent();    Paired->eraseFromParent();    LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());    LIS->shrinkToUses(&AddrRegLI); -  LIS->getInterval(DestReg); // Create new LI +  LIS->createAndComputeVirtRegInterval(DestReg); + +  if (UpdateM0Range) { +    SlotIndex Read2Index = LIS->getInstructionIndex(Read2); +    M0Segment->end = Read2Index.getRegSlot(); +  }    DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');    return Read2.getInstr(); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63d6796..d23b92edef33 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,7 +53,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(    if (!LaneVGPRs.count(LaneVGPRIdx)) {      unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);      LaneVGPRs[LaneVGPRIdx] = LaneVGPR; -    MRI.setPhysRegUsed(LaneVGPR);      // Add this register as live-in to all blocks to avoid machine verifer      // complaining about use of an undefined physical register. diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp index 0a7f684552f0..b086d2ed6652 100644 --- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp @@ -91,7 +91,6 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {    if (ScratchOffsetReg != AMDGPU::NoRegister) {      // Found an SGPR to use -    MRI.setPhysRegUsed(ScratchOffsetReg);      BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)              .addReg(ScratchOffsetPreloadReg);    } else { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index db2ff0b1f952..ce4acafac9fa 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -499,7 +499,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,    for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();         I != E; ++I) { -    if (!MRI.isPhysRegUsed(*I)) +    if (MRI.reg_nodbg_empty(*I))        return *I;    }    return AMDGPU::NoRegister; diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 51e72cdb5f9e..5d00bdd6a9bb 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -94,8 +94,20 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,    // is vcc.  We should handle this the same way we handle vopc, by addding    // a register allocation hint pre-regalloc and then do the shrining    // post-regalloc. 
-  if (Src2) -    return false; +  if (Src2) { +    switch (MI.getOpcode()) { +      default: return false; + +      case AMDGPU::V_MAC_F32_e64: +        if (!isVGPR(Src2, TRI, MRI) || +            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) +          return false; +        break; + +      case AMDGPU::V_CNDMASK_B32_e64: +        break; +    } +  }    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);    const MachineOperand *Src1Mod = @@ -149,7 +161,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,      return;    // Try to fold Src0 -  if (Src0.isReg()) { +  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {      unsigned Reg = Src0.getReg();      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);      if (Def && Def->isMoveImmediate()) { @@ -243,6 +255,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {            continue;        } +      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { +        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC +        // instructions. +        const MachineOperand *Src2 = +            TII->getNamedOperand(MI, AMDGPU::OpName::src2); +        if (!Src2->isReg()) +          continue; +        unsigned SReg = Src2->getReg(); +        if (TargetRegisterInfo::isVirtualRegister(SReg)) { +          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); +          continue; +        } +        if (SReg != AMDGPU::VCC) +          continue; +      } +        // We can shrink this instruction        DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); @@ -259,6 +287,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {        if (Src1)          Inst32.addOperand(*Src1); +      const MachineOperand *Src2 = +          TII->getNamedOperand(MI, AMDGPU::OpName::src2); +      if (Src2) +        Inst32.addOperand(*Src2); +        ++NumInstructionsShrunk;        MI.eraseFromParent(); | 
