diff options
Diffstat (limited to 'lib/Target')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPU.td | 9 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUFeatures.td | 9 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600ISelLowering.cpp | 88 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600ISelLowering.h | 8 | ||||
-rw-r--r-- | lib/Target/AMDGPU/VOP3Instructions.td | 11 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 116 |
6 files changed, 160 insertions, 81 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 16c2a366db285..445b69b35eb15 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -267,15 +267,6 @@ def FeatureD16PreservesUnusedBits : SubtargetFeature< // Subtarget Features (options and debugging) //===------------------------------------------------------------===// -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling" ->; - // Denormal handling for fp64 and fp16 is controlled by the same // config register when fp16 supported. // TODO: Do we need a separate f16 setting when not legal? diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td index b375cae9018ea..3c7d8a8fc5509 100644 --- a/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -19,6 +19,15 @@ def FeatureFMA : SubtargetFeature<"fmaf", "Enable single precision FMA (not as fast as mul+add, but fused)" >; +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. +def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling" +>; + class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 113d6249fa60a..e00dffc4be99a 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -903,7 +903,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUASI.PARAM_I_ADDRESS); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt<16>(ByteOffset)); @@ -1457,33 +1457,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return scalarizeVectorLoad(LoadNode, DAG); } + // This is still used for explicit load from addrspace(8) int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || - isa<Constant>(LoadNode->getMemOperand()->getValue()) || + if (isa<Constant>(LoadNode->getMemOperand()->getValue()) || isa<ConstantSDNode>(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. - // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); } else { + //TODO: Does this even work? // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, @@ -1622,7 +1606,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUASI.PARAM_I_ADDRESS); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. It attempts to create this sextload, but it ends up @@ -1646,17 +1630,17 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); + unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, - MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | + MemVT, Alignment, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); } return Chain; @@ -1804,6 +1788,52 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], return BuildVector; } +SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const { + SDLoc DL(LoadNode); + EVT VT = LoadNode->getValueType(0); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + assert (isa<ConstantSDNode>(Ptr)); + + //TODO: Support smaller loads + if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) + return SDValue(); + + if (LoadNode->getAlignment() < 4) + return SDValue(); + + int ConstantBlock = ConstantAddressBlock(Block); + + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); +} + //===----------------------------------------------------------------------===// // Custom DAG Optimizations //===----------------------------------------------------------------------===// @@ -2022,6 +2052,16 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); } + + case ISD::LOAD: { + LoadSDNode *LoadNode = cast<LoadSDNode>(N); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS && + isa<ConstantSDNode>(Ptr)) + return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG); + break; + } + default: break; } diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 907d1f10e1519..767c3c7bd5bfe 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -98,9 +98,11 @@ private: bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; - bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, - SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, - SelectionDAG &DAG) const; + bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, + SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const; + SDValue constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 17ae08dc62670..26bc5260e17f4 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -461,17 +461,6 @@ def : GCNPat < (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) >; -def : GCNPat< - (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) ->; - -def : GCNPat< - (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (REG_SEQUENCE VReg_64, - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; } defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 4dfa8477a362f..21939d836dc73 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -81,10 +82,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { case ISD::LOAD: + case ISD::ATOMIC_LOAD: if (tryLoad(N)) return; break; case ISD::STORE: + case ISD::ATOMIC_STORE: if (tryStore(N)) return; break; @@ -834,17 +837,27 @@ static Optional<unsigned> pickOpcodeForVT( bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDLoc dl(N); - LoadSDNode *LD = cast<LoadSDNode>(N); + MemSDNode *LD = cast<MemSDNode>(N); + assert(LD->readMem() && "Expected load"); + LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N); EVT LoadedVT = LD->getMemoryVT(); SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec - if (LD->isIndexed()) + if (PlainLoad && PlainLoad->isIndexed()) return false; if (!LoadedVT.isSimple()) return false; + AtomicOrdering Ordering = LD->getOrdering(); + // In order to lower atomic loads with stronger guarantees we would need to + // use load.acquire or insert fences. However these features were only added + // with PTX ISA 6.0 / sm_70. + // TODO: Check if we can actually use the new instructions and implement them. + if (isStrongerThanMonotonic(Ordering)) + return false; + // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(LD); if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { @@ -855,8 +868,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool isVolatile = LD->isVolatile(); + // - .volatile is only available for .global and .shared + // - .volatile has the same memory synchronization semantics as .relaxed.sys + bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic; if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) @@ -882,7 +896,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { fromTypeWidth = 32; } - if ((LD->getExtensionType() == ISD::SEXTLOAD)) + if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) fromType = NVPTX::PTXLdStInstCode::Signed; else if (ScalarVT.isFloatingPoint()) // f16 uses .b16 as its storage type. @@ -1691,25 +1705,38 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { SDLoc dl(N); - StoreSDNode *ST = cast<StoreSDNode>(N); + MemSDNode *ST = cast<MemSDNode>(N); + assert(ST->writeMem() && "Expected store"); + StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N); + AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N); + assert((PlainStore || AtomicStore) && "Expected store"); EVT StoreVT = ST->getMemoryVT(); SDNode *NVPTXST = nullptr; // do not support pre/post inc/dec - if (ST->isIndexed()) + if (PlainStore && PlainStore->isIndexed()) return false; if (!StoreVT.isSimple()) return false; + AtomicOrdering Ordering = ST->getOrdering(); + // In order to lower atomic loads with stronger guarantees we would need to + // use store.release or insert fences. However these features were only added + // with PTX ISA 6.0 / sm_70. + // TODO: Check if we can actually use the new instructions and implement them. + if (isStrongerThanMonotonic(Ordering)) + return false; + // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(ST); unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool isVolatile = ST->isVolatile(); + // - .volatile is only available for .global and .shared + // - .volatile has the same memory synchronization semantics as .relaxed.sys + bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) @@ -1739,41 +1766,53 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { toType = NVPTX::PTXLdStInstCode::Unsigned; // Create the machine instruction DAG - SDValue Chain = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue N2 = N->getOperand(2); + SDValue Chain = ST->getChain(); + SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); + SDValue BasePtr = ST->getBasePtr(); SDValue Addr; SDValue Offset, Base; Optional<unsigned> Opcode; - MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy; + MVT::SimpleValueType SourceVT = + Value.getNode()->getSimpleValueType(0).SimpleTy; - if (SelectDirectAddr(N2, Addr)) { + if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar, NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr, - Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Addr, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) - : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 + ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) + : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi, NVPTX::ST_i32_asi, NVPTX::ST_i64_asi, NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi, NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, - Offset, Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset) - : SelectADDRri(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 + ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) + : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { if (PointerSize == 64) Opcode = pickOpcodeForVT( SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64, @@ -1787,10 +1826,15 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, - Offset, Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); } else { if (PointerSize == 64) @@ -1806,10 +1850,14 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2, - Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + BasePtr, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); } |