| author | Dimitry Andric <dim@FreeBSD.org> | 2017-01-09 21:23:09 +0000 | 
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-01-09 21:23:09 +0000 | 
| commit | 909545a822eef491158f831688066f0ec2866938 (patch) | |
| tree | 5b0bf0e81294007a9b462b21031b3df272c655c3 /lib/Target/AMDGPU/R600ISelLowering.cpp | |
| parent | 7e7b6700743285c0af506ac6299ddf82ebd434b9 (diff) | |
Diffstat (limited to 'lib/Target/AMDGPU/R600ISelLowering.cpp')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/Target/AMDGPU/R600ISelLowering.cpp | 281 |

1 file changed, 132 insertions(+), 149 deletions(-)
```diff
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 89c9266746ac..de7ce5cb9e47 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -99,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 
+  // We need to include these since trunc STORES to PRIVATE need
+  // special handling to accommodate RMW
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
+  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
+  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
+
   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
```
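These new `setTruncStoreAction` entries route vector truncating stores to private memory into custom lowering. For context, a hypothetical source pattern of the kind that produces such stores (this snippet is illustrative, not taken from the commit): after vectorization, the loop below becomes a `v4i32 -> v4i8` truncating store to a private (stack) array.

```cpp
#include <cstdint>

// Hypothetical source pattern: after vectorization this becomes a
// v4i32 -> v4i8 truncating store to a private (stack) array, which the
// new setTruncStoreAction entries route into custom lowering.
void pack4(const int32_t in[4], uint8_t out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<uint8_t>(in[i]); // i32 truncated to i8 on store
}
```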
```diff
@@ -1087,79 +1099,114 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
 
 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                    SelectionDAG &DAG) const {
   SDLoc DL(Store);
+  //TODO: Who creates the i8 stores?
+  assert(Store->isTruncatingStore()
+         || Store->getValue().getValueType() == MVT::i8);
+  assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
 
-  unsigned Mask = 0;
+  SDValue Mask;
   if (Store->getMemoryVT() == MVT::i8) {
-    Mask = 0xff;
+    assert(Store->getAlignment() >= 1);
+    Mask = DAG.getConstant(0xff, DL, MVT::i32);
   } else if (Store->getMemoryVT() == MVT::i16) {
-    Mask = 0xffff;
+    assert(Store->getAlignment() >= 2);
+    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
+  } else {
+    llvm_unreachable("Unsupported private trunc store");
   }
 
   SDValue Chain = Store->getChain();
   SDValue BasePtr = Store->getBasePtr();
+  SDValue Offset = Store->getOffset();
   EVT MemVT = Store->getMemoryVT();
 
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
-                            DAG.getConstant(2, DL, MVT::i32));
-  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                            Chain, Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32));
+  SDValue LoadPtr = BasePtr;
+  if (!Offset.isUndef()) {
+    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+  }
+
+  // Get dword location
+  // TODO: this should be eliminated by the future SHR ptr, 2
+  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+                            DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+  // Load dword
+  // TODO: can we be smarter about machine pointer info?
+  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
 
-  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+  Chain = Dst.getValue(1);
+
+  // Get offset in dword
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                 DAG.getConstant(0x3, DL, MVT::i32));
 
+  // Convert byte offset to bit shift
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
 
+  // TODO: Contrary to the name of the function,
+  // it also handles sub i32 non-truncating stores (like i1)
   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                   Store->getValue());
 
+  // Mask the value to the right type
   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
 
+  // Shift the value in place
   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                      MaskedValue, ShiftAmt);
 
-  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                DAG.getConstant(Mask, DL, MVT::i32),
-                                ShiftAmt);
-  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
-                        DAG.getConstant(0xffffffff, DL, MVT::i32));
+  // Shift the mask in place
+  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
+
+  // Invert the mask. NOTE: if we had native ROL instructions we could
+  // use inverted mask
+  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
+
+  // Cleanup the target bits
   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
 
+  // Add the new bits
   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
-  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                     Chain, Value, Ptr,
-                     DAG.getTargetConstant(0, DL, MVT::i32));
+
+  // Store dword
+  // TODO: Can we be smarter about MachinePointerInfo?
+  return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
 }
 
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   unsigned AS = StoreNode->getAddressSpace();
+
+  SDValue Chain = StoreNode->getChain();
+  SDValue Ptr = StoreNode->getBasePtr();
   SDValue Value = StoreNode->getValue();
-  EVT ValueVT = Value.getValueType();
+
+  EVT VT = Value.getValueType();
   EVT MemVT = StoreNode->getMemoryVT();
-  unsigned Align = StoreNode->getAlignment();
+  EVT PtrVT = Ptr.getValueType();
+  SDLoc DL(Op);
+
+  // Neither LOCAL nor PRIVATE can do vectors at the moment
   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
-      ValueVT.isVector()) {
-    return SplitVectorStore(Op, DAG);
+      VT.isVector()) {
+    return scalarizeVectorStore(StoreNode, DAG);
   }
 
-  // Private AS needs special fixes
-  if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
       !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
     return expandUnalignedStore(StoreNode, DAG);
   }
 
-  SDLoc DL(Op);
-  SDValue Chain = StoreNode->getChain();
-  SDValue Ptr = StoreNode->getBasePtr();
+  SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
+                                  DAG.getConstant(2, DL, PtrVT));
 
   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
     // It is beneficial to create MSKOR here instead of combiner to avoid
     // artificial dependencies introduced by RMW
     if (StoreNode->isTruncatingStore()) {
-      EVT VT = Value.getValueType();
       assert(VT.bitsLE(MVT::i32));
       SDValue MaskConstant;
       if (MemVT == MVT::i8) {
```
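The read-modify-write sequence that `lowerPrivateTruncStore` now emits with ordinary loads and stores reduces to simple bit arithmetic. A minimal scalar sketch, assuming a dword-organized view of private memory (`mem` and `storeSubDword` are illustrative names, not code from this commit):

```cpp
#include <cstdint>

// Illustrative scalar model of the RMW sequence lowerPrivateTruncStore
// emits; `mask` is 0xff for an i8 store or 0xffff for an i16 store.
void storeSubDword(uint32_t *mem, uint32_t addr, uint32_t value,
                   uint32_t mask) {
  uint32_t dwordIdx = (addr & 0xfffffffcu) >> 2; // get dword location
  uint32_t shift = (addr & 0x3u) * 8;            // byte offset -> bit shift
  uint32_t dst = mem[dwordIdx];                  // load dword
  dst &= ~(mask << shift);                       // cleanup the target bits
  dst |= (value & mask) << shift;                // add the new bits
  mem[dwordIdx] = dst;                           // store dword back
}
```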
```diff
@@ -1169,15 +1216,19 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
         assert(StoreNode->getAlignment() >= 2);
         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
       }
-      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
-                                      DAG.getConstant(2, DL, MVT::i32));
-      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
-                                      DAG.getConstant(0x00000003, DL, VT));
+
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
+                                      DAG.getConstant(0x00000003, DL, PtrVT));
+      SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                     DAG.getConstant(3, DL, VT));
+
+      // Put the mask in correct place
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
+
+      // Put the value bits in correct place
       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
-      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
-                                   DAG.getConstant(3, DL, VT));
-      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
-      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
+
       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
       // vector instead.
       SDValue Src[4] = {
```
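For global truncating stores the same bit math is computed, but both results feed a single `STORE_MSKOR` node, so no explicit load (and no RMW dependency) is needed. A sketch of the two operands the node receives (function and parameter names are illustrative):

```cpp
#include <cstdint>

// The MSKOR path computes the same (mask, value) pair as the RMW
// sequence, but hands both to one STORE_MSKOR memory intrinsic node.
void mskorOperands(uint32_t addr, uint32_t value, uint32_t maskConstant,
                   uint32_t &maskOut, uint32_t &valueOut) {
  uint32_t bitShift = (addr & 0x3u) * 8;         // ByteIndex << 3
  maskOut = maskConstant << bitShift;            // Mask, put in place
  valueOut = (value & maskConstant) << bitShift; // TruncValue, shifted
}
```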
```diff
@@ -1191,12 +1242,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                      Op->getVTList(), Args, MemVT,
                                      StoreNode->getMemOperand());
-    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
-               ValueVT.bitsGE(MVT::i32)) {
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
       // Convert pointer from byte address to dword address.
-      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
-                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
-                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));
+      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
 
       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
         llvm_unreachable("Truncated and indexed stores not supported yet");
@@ -1207,49 +1255,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
+  // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
 
   if (MemVT.bitsLT(MVT::i32))
     return lowerPrivateTruncStore(StoreNode, DAG);
 
-  // Lowering for indirect addressing
-  const MachineFunction &MF = DAG.getMachineFunction();
-  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
-  unsigned StackWidth = TFL->getStackWidth(MF);
-
-  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
-  if (ValueVT.isVector()) {
-    unsigned NumElemVT = ValueVT.getVectorNumElements();
-    EVT ElemVT = ValueVT.getVectorElementType();
-    SmallVector<SDValue, 4> Stores(NumElemVT);
-
-    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
-                                      "vector width in load");
-
-    for (unsigned i = 0; i < NumElemVT; ++i) {
-      unsigned Channel, PtrIncr;
-      getStackAddress(StackWidth, i, Channel, PtrIncr);
-      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
-                        DAG.getConstant(PtrIncr, DL, MVT::i32));
-      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
-                                 Value, DAG.getConstant(i, DL, MVT::i32));
-
-      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                              Chain, Elem, Ptr,
-                              DAG.getTargetConstant(Channel, DL, MVT::i32));
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
-  } else {
-    if (ValueVT == MVT::i8) {
-      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
-    }
-    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
-    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+  // Standard i32+ store, tag it with DWORDADDR to note that the address
+  // has been shifted
+  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
+    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   }
 
-  return Chain;
+  // Tagged i32+ stores will be matched by patterns
+  return SDValue();
 }
```
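The surviving i32-and-wider private stores thus become ordinary stores whose pointer is pre-shifted once and tagged with `DWORDADDR`, so instruction patterns can match them directly instead of going through `REGISTER_STORE`. As scalar arithmetic the address convention is just this (a sketch; `frame` is an illustrative stand-in for private memory):

```cpp
#include <cstdint>

// Illustrative model of the DWORDADDR convention: the byte address is
// shifted right by 2 once, and everything downstream indexes dwords.
void storePrivateI32(uint32_t *frame, uint32_t byteAddr, uint32_t value) {
  uint32_t dwordAddr = byteAddr >> 2; // the SRL feeding the DWORDADDR node
  frame[dwordAddr] = value;           // matched later by store patterns
}
```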
```diff
@@ -1299,51 +1320,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
   LoadSDNode *Load = cast<LoadSDNode>(Op);
   ISD::LoadExtType ExtType = Load->getExtensionType();
   EVT MemVT = Load->getMemoryVT();
+  assert(Load->getAlignment() >= MemVT.getStoreSize());
 
-  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
-  // register (2-)byte extract.
+  SDValue BasePtr = Load->getBasePtr();
+  SDValue Chain = Load->getChain();
+  SDValue Offset = Load->getOffset();
 
-  // Get Register holding the target.
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, DL, MVT::i32));
-  // Load the Register.
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(),
-                            Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32),
-                            Op.getOperand(2));
+  SDValue LoadPtr = BasePtr;
+  if (!Offset.isUndef()) {
+    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+  }
+
+  // Get dword location
+  // NOTE: this should be eliminated by the future SHR ptr, 2
+  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+                            DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+  // Load dword
+  // TODO: can we be smarter about machine pointer info?
+  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
 
   // Get offset within the register.
   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
-                                Load->getBasePtr(),
-                                DAG.getConstant(0x3, DL, MVT::i32));
+                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
 
   // Bit offset of target byte (byteIdx * 8).
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
 
   // Shift to the right.
-  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
 
   // Eliminate the upper bits by setting them to ...
   EVT MemEltVT = MemVT.getScalarType();
 
-  // ... ones.
-  if (ExtType == ISD::SEXTLOAD) {
+  if (ExtType == ISD::SEXTLOAD) { // ... ones.
     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
-      Load->getChain()
-    };
-
-    return DAG.getMergeValues(Ops, DL);
+    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+  } else { // ... or zeros.
+    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
   }
 
-  // ... or zeros.
   SDValue Ops[] = {
-    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
-    Load->getChain()
+    Ret,
+    Read.getValue(1) // This should be our output chain
   };
 
   return DAG.getMergeValues(Ops, DL);
```
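`lowerPrivateExtLoad` is the mirror image of the truncating store: load the containing dword, shift the target byte or halfword down, then sign- or zero-extend. A minimal scalar sketch of the same logic (names illustrative, not from the commit):

```cpp
#include <cstdint>

// Illustrative scalar model of lowerPrivateExtLoad; `bits` is 8 or 16.
int32_t loadSubDword(const uint32_t *mem, uint32_t addr,
                     uint32_t bits, bool sext) {
  uint32_t dword = mem[(addr & 0xfffffffcu) >> 2]; // load dword
  uint32_t shift = (addr & 0x3u) * 8;              // bit offset of target
  uint32_t ret = dword >> shift;                   // shift to the right
  if (sext) // eliminate the upper bits by setting them to ones
    return static_cast<int32_t>(ret << (32 - bits)) >> (32 - bits);
  // ... or zeros
  return static_cast<int32_t>(ret & ((1u << bits) - 1));
}
```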
```diff
@@ -1365,12 +1385,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = LoadNode->getChain();
   SDValue Ptr = LoadNode->getBasePtr();
 
-  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
-    SDValue MergedValues[2] = {
-      scalarizeVectorLoad(LoadNode, DAG),
-      Chain
-    };
-    return DAG.getMergeValues(MergedValues, DL);
+  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+      LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      VT.isVector()) {
+      return scalarizeVectorLoad(LoadNode, DAG);
   }
 
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1421,8 +1439,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getMergeValues(MergedValues, DL);
   }
 
-  SDValue LoweredLoad;
-
   // For most operations returning SDValue() will result in the node being
   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   // need to manually expand loads that may be legal in some address spaces and
@@ -1447,47 +1463,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return SDValue();
   }
 
-  // Lowering for indirect addressing
-  const MachineFunction &MF = DAG.getMachineFunction();
-  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
-  unsigned StackWidth = TFL->getStackWidth(MF);
-
-  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
-  if (VT.isVector()) {
-    unsigned NumElemVT = VT.getVectorNumElements();
-    EVT ElemVT = VT.getVectorElementType();
-    SDValue Loads[4];
-
-    assert(NumElemVT <= 4);
-    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
-                                      "vector width in load");
-
-    for (unsigned i = 0; i < NumElemVT; ++i) {
-      unsigned Channel, PtrIncr;
-      getStackAddress(StackWidth, i, Channel, PtrIncr);
-      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
-                        DAG.getConstant(PtrIncr, DL, MVT::i32));
-      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
-                             Chain, Ptr,
-                             DAG.getTargetConstant(Channel, DL, MVT::i32),
-                             Op.getOperand(2));
-    }
-    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
-    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
-  } else {
-    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
-                              Chain, Ptr,
-                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
-                              Op.getOperand(2));
+  // DWORDADDR ISD marks already shifted address
+  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+    assert(VT == MVT::i32);
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+    return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
   }
-
-  SDValue Ops[2] = {
-    LoweredLoad,
-    Chain
-  };
-
-  return DAG.getMergeValues(Ops, DL);
+  return SDValue();
 }
 
 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
```
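With the old `REGISTER_LOAD` channel machinery gone, vector accesses to LOCAL and PRIVATE memory are instead split element-wise by `scalarizeVectorLoad`/`scalarizeVectorStore`, and each scalar piece goes through the paths above. Roughly, for a dword-aligned `v4i32` private load (illustrative C++, not the SelectionDAG API):

```cpp
#include <cstdint>

// Rough model of what scalarizeVectorLoad amounts to for a v4i32
// private load: one i32 load per element, rebuilt into a vector.
void loadV4I32(const uint32_t *frame, uint32_t byteAddr, uint32_t out[4]) {
  uint32_t base = byteAddr >> 2;  // dword index of element 0
  for (int i = 0; i < 4; ++i)
    out[i] = frame[base + i];     // element-wise i32 loads
}
```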
