Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp  583
1 file changed, 329 insertions(+), 254 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2b6308dc1549..aaf448346b53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -16,7 +16,6 @@
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -29,6 +28,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -252,7 +252,6 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -265,16 +264,10 @@ private:
SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -286,7 +279,6 @@ private:
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
- void SelectDIV_FMAS(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -301,6 +293,7 @@ private:
void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
@@ -409,7 +402,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
+ Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -655,29 +648,6 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
-static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
- switch (NumVectorElts) {
- case 1:
- return AMDGPU::SReg_32RegClassID;
- case 2:
- return AMDGPU::SReg_64RegClassID;
- case 3:
- return AMDGPU::SGPR_96RegClassID;
- case 4:
- return AMDGPU::SGPR_128RegClassID;
- case 5:
- return AMDGPU::SGPR_160RegClassID;
- case 8:
- return AMDGPU::SReg_256RegClassID;
- case 16:
- return AMDGPU::SReg_512RegClassID;
- case 32:
- return AMDGPU::SReg_1024RegClassID;
- }
-
- llvm_unreachable("invalid vector size");
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -698,6 +668,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
+ Triple::amdgcn;
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -707,7 +679,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
@@ -717,7 +690,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -742,7 +716,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -801,7 +776,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+ unsigned RegClassID =
+ SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
SelectBuildVector(N, RegClassID);
return;
}
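
The call to getSGPRClassForBitWidth replaces the hand-written switch deleted above; the mapping is simply element count times 32 bits. A standalone sketch of the correspondence (not part of the patch; the class names are taken from the removed selectSGPRVectorRegClassID switch):

#include <cassert>

// Bit width -> SGPR register class, matching the removed switch:
// 1 elt -> SReg_32, 2 -> SReg_64, ..., 32 -> SReg_1024.
static const char *sgprClassForBitWidth(unsigned Bits) {
  switch (Bits) {
  case 32:   return "SReg_32";
  case 64:   return "SReg_64";
  case 96:   return "SGPR_96";
  case 128:  return "SGPR_128";
  case 160:  return "SGPR_160";
  case 256:  return "SReg_256";
  case 512:  return "SReg_512";
  case 1024: return "SReg_1024";
  default:   return nullptr; // invalid vector size
  }
}

int main() {
  unsigned NumVectorElts = 4; // v4i32 build_vector
  assert(sgprClassForBitWidth(NumVectorElts * 32) != nullptr);
  return 0;
}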
@@ -874,10 +850,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
- case AMDGPUISD::DIV_FMAS: {
- SelectDIV_FMAS(N);
- return;
- }
case AMDGPUISD::MAD_I64_I32:
case AMDGPUISD::MAD_U64_U32: {
SelectMAD_64_32(N);
@@ -1020,8 +992,14 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ static const unsigned OpcMap[2][2][2] = {
+ {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
+ {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}},
+ {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
+ {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
+
+ unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
+ unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
SDNode *AddLo;
if (!ConsumeCarry) {
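
The OpcMap table above folds two ternaries into one lookup so that uniform (SALU) and divergent (VALU) opcodes are chosen from N->isDivergent(). A minimal standalone sketch of the indexing scheme (not part of the patch; the enum values are illustrative placeholders, not the real AMDGPU opcode enums):

#include <cassert>

// OpcMap[HasCarryIn][IsDivergent][IsAdd], mirroring SelectADD_SUB_I64 above.
enum Opc { S_SUB, S_ADD, V_SUB, V_ADD, S_SUBB, S_ADDC, V_SUBB, V_ADDC };

static const Opc OpcMap[2][2][2] = {
    {{S_SUB, S_ADD}, {V_SUB, V_ADD}},       // low half: plain add/sub
    {{S_SUBB, S_ADDC}, {V_SUBB, V_ADDC}}};  // high half: consumes carry

int main() {
  bool IsDivergent = true, IsAdd = true;
  assert(OpcMap[0][IsDivergent][IsAdd] == V_ADD);    // divergent add, low half
  assert(OpcMap[1][!IsDivergent][!IsAdd] == S_SUBB); // uniform sub, high half
  return 0;
}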
@@ -1063,24 +1041,51 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
SDValue RHS = N->getOperand(1);
SDValue CI = N->getOperand(2);
- unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ if (N->isDivergent()) {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {LHS, RHS, CI,
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
+ : AMDGPU::S_SUB_CO_PSEUDO;
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
+ }
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
// The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
// FIXME: We should probably rename the opcodes here.
- unsigned Opc = N->getOpcode() == ISD::UADDO ?
- AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ bool IsAdd = N->getOpcode() == ISD::UADDO;
+ bool IsVALU = N->isDivergent();
+
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
+ ++UI)
+ if (UI.getUse().getResNo() == 1) {
+ if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
+ (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
+ IsVALU = true;
+ break;
+ }
+ }
+
+ if (IsVALU) {
+ unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
+ : AMDGPU::S_USUBO_PSEUDO;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {N->getOperand(0), N->getOperand(1),
- CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1)});
+ }
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
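
The use scan in SelectUADDO_USUBO above keeps the SALU pseudo only when every consumer of the carry result (result 1) is a matching ADDCARRY/SUBCARRY; otherwise it falls back to the VALU form. A simplified, self-contained sketch of that decision (not part of the patch; Node and Use are reduced to plain structs, not the real SDNode API):

#include <vector>

enum Op { ADDCARRY, SUBCARRY, OTHER };
struct Use { unsigned ResNo; Op UserOp; }; // stand-in for SDUse

// Mirrors the IsVALU decision: divergent nodes and escaping carry-outs
// cannot use S_UADDO_PSEUDO / S_USUBO_PSEUDO.
static bool needsVALU(bool IsAdd, bool IsDivergent,
                      const std::vector<Use> &Uses) {
  if (IsDivergent)
    return true;
  for (const Use &U : Uses)
    if (U.ResNo == 1 && U.UserOp != (IsAdd ? ADDCARRY : SUBCARRY))
      return true; // carry-out escapes the SALU carry chain
  return false;
}

int main() {
  // A uniform uaddo whose carry feeds something other than addcarry still
  // has to select V_ADD_I32_e64.
  return needsVALU(true, false, {{0, OTHER}, {1, OTHER}}) ? 0 : 1;
}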
@@ -1125,35 +1130,6 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
- const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
-
- SDLoc SL(N);
- EVT VT = N->getValueType(0);
-
- assert(VT == MVT::f32 || VT == MVT::f64);
-
- unsigned Opc
- = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
-
- SDValue CarryIn = N->getOperand(3);
- // V_DIV_FMAS implicitly reads VCC.
- SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
- TRI->getVCC(), CarryIn, SDValue());
-
- SDValue Ops[10];
-
- SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
- SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
- SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
-
- Ops[8] = VCC;
- Ops[9] = VCC.getValue(1);
-
- CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
-}
-
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
@@ -1343,6 +1319,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &TFE, SDValue &DLC,
SDValue &SWZ) const {
// Subtarget prefers to use flat instruction
+ // FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
return false;
@@ -1438,6 +1415,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
+ // FIXME: This should be a pattern predicate and not reach here
if (!Subtarget->hasAddr64())
return false;
@@ -1475,6 +1453,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
}
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ SDLoc DL(N);
const MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -1489,9 +1468,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
}
// If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
- MVT::i32));
+ // be relative to the entry point's scratch wave offset.
+ return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1506,22 +1484,26 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
- unsigned Imm = CAddr->getZExtValue();
-
- SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
- MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- DL, MVT::i32, HighBits);
- VAddr = SDValue(MovHighBits, 0);
-
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
-
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
- ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
- return true;
+ int64_t Imm = CAddr->getSExtValue();
+ const int64_t NullPtr =
+ AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
+ // Don't fold null pointer.
+ if (Imm != NullPtr) {
+ SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+ VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo
+ = cast<MemSDNode>(Parent)->getPointerInfo();
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1577,12 +1559,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
// FIXME: Get from MachinePointerInfo? We should only be using the frame
// offset if we know this is in a call sequence.
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
@@ -1646,6 +1628,37 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+ SDValue &N0, SDValue &N1) {
+ if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+ Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
// As we split the 64-bit `or` earlier, it's a complicated pattern to match, i.e.
+ // (i64 (bitcast (v2i32 (build_vector
+ // (or (extract_vector_elt V, 0), OFFSET),
+ // (extract_vector_elt V, 1)))))
+ SDValue Lo = Addr.getOperand(0).getOperand(0);
+ if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+ SDValue BaseLo = Lo.getOperand(0);
+ SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+ // Check that split base (Lo and Hi) are extracted from the same one.
+ if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+ // Lo is statically extracted from index 0.
+ isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+ BaseLo.getConstantOperandVal(1) == 0 &&
// Hi is statically extracted from index 1.
+ isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+ BaseHi.getConstantOperandVal(1) == 1) {
+ N0 = BaseLo.getOperand(0).getOperand(0);
+ N1 = Lo.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
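
getBaseWithOffsetUsingSplitOR re-associates a constant offset out of a 64-bit address whose `or` was previously split into two 32-bit lanes, matching the bitcast/build_vector shape shown in the comment. A small sketch of why this is sound at the value level (not part of the patch; the sample constants are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  // Base with low bits known zero, so `or` with a small offset is carry-free.
  uint64_t Base = 0x1234000000001000ULL;
  uint32_t Off = 0x20;
  // After legalization the 64-bit or becomes two 32-bit lanes; only the low
  // lane carries the constant, which is exactly the
  // (or (extract_vector_elt V, 0), OFFSET) shape matched above.
  uint32_t Lo = uint32_t(Base) | Off;
  uint32_t Hi = uint32_t(Base >> 32);
  uint64_t Addr = (uint64_t(Hi) << 32) | Lo;
  assert(Addr == Base + Off); // disjoint bits: the or behaves as an add
  return 0;
}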
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue Addr,
@@ -1656,84 +1669,91 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
if (Subtarget->hasFlatInstOffsets() &&
(!Subtarget->hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
-
- SDLoc DL(N);
- uint64_t ImmField;
- const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
- if (IsSigned) {
- ImmField = SignExtend64(COffsetVal, NumBits);
-
- // Don't use a negative offset field if the base offset is positive.
- // Since the scheduler currently relies on the offset field, doing so
- // could result in strange scheduling decisions.
-
- // TODO: Should we not do this in the opposite direction as well?
- if (static_cast<int64_t>(COffsetVal) > 0) {
- if (static_cast<int64_t>(ImmField) < 0) {
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
- ImmField = COffsetVal & OffsetMask;
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+ SDValue N0, N1;
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask =
+ maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
}
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
}
- } else {
- // TODO: Should we do this for a negative offset?
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
- ImmField = COffsetVal & OffsetMask;
- }
- uint64_t RemainderOffset = COffsetVal - ImmField;
+ uint64_t RemainderOffset = COffsetVal - ImmField;
- assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
- assert(RemainderOffset + ImmField == COffsetVal);
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
- OffsetVal = ImmField;
+ OffsetVal = ImmField;
- // TODO: Should this try to use a scalar add pseudo if the base address is
- // uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ // TODO: Should this try to use a scalar add pseudo if the base address
+ // is uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub1);
- SDValue AddOffsetLo
- = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue AddOffsetHi
- = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SDNode *Add = CurDAG->getMachineNode(
- AMDGPU::V_ADD_I32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
- };
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs), 0);
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
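
When the constant does not fit the flat offset field, the code above keeps only the low bits in the immediate and adds the remainder to the base. A standalone sketch of the split arithmetic, assuming a hypothetical 13-bit signed field (the real width comes from getNumFlatOffsetBits and varies by subtarget and address space):

#include <cassert>
#include <cstdint>

static int64_t signExtend(uint64_t V, unsigned Bits) {
  return int64_t(V << (64 - Bits)) >> (64 - Bits);
}

int main() {
  const unsigned NumBits = 13;  // assumed field width
  uint64_t COffsetVal = 0x1FFF; // sign-extends to a negative 13-bit value
  uint64_t ImmField = signExtend(COffsetVal, NumBits);
  // A positive base offset must not produce a negative immediate field,
  // so fall back to the unsigned low NumBits-1 bits instead.
  if (int64_t(COffsetVal) > 0 && int64_t(ImmField) < 0)
    ImmField = COffsetVal & ((1ULL << (NumBits - 1)) - 1);
  uint64_t RemainderOffset = COffsetVal - ImmField;
  assert(ImmField + RemainderOffset == COffsetVal); // same invariant as above
  // ImmField is encoded in the instruction; RemainderOffset is added to the
  // base with V_ADD_I32_e64 / V_ADDC_U32_e64 as in the DAG code above.
  return 0;
}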
@@ -1761,35 +1781,52 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
-
- // FIXME: Handle non-constant offsets.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
- if (!C)
+ if (!C) {
+ if (ByteOffsetNode.getValueType().isScalarInteger() &&
+ ByteOffsetNode.getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode;
+ Imm = false;
+ return true;
+ }
+ if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
+ if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode.getOperand(0);
+ Imm = false;
+ return true;
+ }
+ }
return false;
+ }
SDLoc SL(ByteOffsetNode);
- GCNSubtarget::Generation Gen = Subtarget->getGeneration();
+ // GFX9 and GFX10 have signed byte immediate offsets.
int64_t ByteOffset = C->getSExtValue();
- int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
-
- if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ Optional<int64_t> EncodedOffset =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
Imm = true;
return true;
}
- if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+ // SGPR and literal offsets are unsigned.
+ if (ByteOffset < 0)
return false;
- if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
- // 32-bit Immediates are supported on Sea Islands.
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
- } else {
- SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
- C32Bit), 0);
+ EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ return true;
}
- Imm = false;
+
+ if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
+ return false;
+
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+
return true;
}
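
SelectSMRDOffset now tries three encodings in order: the native immediate field, a 32-bit literal, then materializing the offset with S_MOV_B32. A sketch of that ladder with stand-in helpers (not part of the patch; the real legality rules live in AMDGPU::getSMRDEncodedOffset and getSMRDEncodedLiteralOffset32 and are subtarget-specific, so the 20-bit bound below is only an assumption for illustration):

#include <cstdint>
#include <optional>

// Stand-in for AMDGPU::getSMRDEncodedOffset (assumed 20-bit signed field).
static std::optional<int64_t> encodedImm(int64_t Byte) {
  if (Byte >= -(1 << 19) && Byte < (1 << 19))
    return Byte;
  return std::nullopt;
}

// Stand-in for getSMRDEncodedLiteralOffset32 (Sea Islands 32-bit literal).
static std::optional<int64_t> encodedLiteral32(int64_t Byte, bool IsCI) {
  if (IsCI && Byte >= 0 && Byte <= UINT32_MAX)
    return Byte;
  return std::nullopt;
}

enum class OffsetKind { Imm, Literal32, MovToSGPR, Fail };

static OffsetKind classify(int64_t ByteOffset, bool IsCI) {
  if (encodedImm(ByteOffset))
    return OffsetKind::Imm;       // fits the immediate field; Imm = true
  if (ByteOffset < 0)
    return OffsetKind::Fail;      // SGPR and literal offsets are unsigned
  if (encodedLiteral32(ByteOffset, IsCI))
    return OffsetKind::Literal32;
  if (uint64_t(ByteOffset) <= UINT32_MAX)
    return OffsetKind::MovToSGPR; // S_MOV_B32 + SGPR offset; Imm = false
  return OffsetKind::Fail;
}

int main() {
  return classify(1 << 21, /*IsCI=*/false) == OffsetKind::MovToSGPR ? 0 : 1;
}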
@@ -1825,14 +1862,21 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
- Addr->getFlags().hasNoUnsignedWrap()) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
-
- if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = Expand32BitAddress(N0);
- return true;
+ Addr->getFlags().hasNoUnsignedWrap())) {
+ SDValue N0, N1;
+ // Extract the base and offset if possible.
+ if (CurDAG->isBaseWithConstantOffset(Addr) ||
+ Addr.getOpcode() == ISD::ADD) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = Expand32BitAddress(N0);
+ return true;
+ }
}
}
SBase = Expand32BitAddress(Addr);
@@ -1843,17 +1887,16 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
+ bool Imm = false;
if (!SelectSMRD(Addr, SBase, Offset, Imm))
return false;
@@ -1862,27 +1905,38 @@ bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
!isa<ConstantSDNode>(Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
SDValue &Offset) const {
- bool Imm;
- return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ // The immediate offset for S_BUFFER instructions is unsigned.
+ if (auto Imm =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
- if (!SelectSMRDOffset(Addr, Offset, Imm))
- return false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
+ C->getZExtValue())) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
- return !Imm && isa<ConstantSDNode>(Offset);
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
@@ -1898,7 +1952,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
// (add n0, c0)
// Don't peel off the offset (c0) if doing so could possibly lead
// the base (n0) to be negative.
- if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+ // (or n0, |c0|) can never change the sign, given isBaseWithConstantOffset.
+ if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
+ (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
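
The new OR clause above relies on isBaseWithConstantOffset only treating `or` as an add when the operands share no set bits, so peeling off the constant cannot flip the base's sign. A one-line demonstration of that identity (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t N0 = 0x100; // low bits known zero
  uint32_t C0 = 0xF;   // disjoint from N0's set bits
  assert((N0 | C0) == N0 + C0); // carry-free: or acts as add, sign unchanged
  return 0;
}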
@@ -2066,7 +2122,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
- unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
+ Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
if (!UseSCCBr) {
@@ -2121,7 +2177,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert((IsFMA || !Mode.FP32Denormals) &&
+ assert((IsFMA || !Mode.allFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -2338,6 +2394,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
+ if (Subtarget->getLDSBankCount() != 16) {
+ // This is a single instruction with a pattern.
+ SelectCode(N);
+ return;
+ }
+
+ SDLoc DL(N);
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+ //
+ // def : Pat <
+ // (int_amdgcn_interp_p1_f16
+ // (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ // (i32 timm:$attrchan), (i32 timm:$attr),
+ // (i1 timm:$high), M0),
+ // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
+ // timm:$attrchan, 0,
+ // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
+ // let Predicates = [has16BankLDS];
+ // }
+
+ // 16 bank LDS
+ SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
+ N->getOperand(5), SDValue());
+
+ SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
+
+ SDNode *InterpMov =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
+ CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ ToM0.getValue(1) // In glue
+ });
+
+ SDNode *InterpP1LV =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+ N->getOperand(1), // Src0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+ SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+ N->getOperand(4), // high
+ CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+ SDValue(InterpMov, 1)
+ });
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
@@ -2366,6 +2480,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_wwm:
Opcode = AMDGPU::WWM;
break;
+ case Intrinsic::amdgcn_interp_p1_f16:
+ SelectInterpP1F16(N);
+ return;
default:
SelectCode(N);
return;
@@ -2428,15 +2545,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- if (In.getValueType() == MVT::f32)
- return SelectVOP3Mods(In, Src, SrcMods);
- Src = In;
- SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);;
- return true;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
return false;
@@ -2520,17 +2628,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp and op_sel
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3PMods(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -2539,34 +2636,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSel(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
// FIXME: Handle op_sel
return SelectVOP3Mods(In, Src, SrcMods);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSelMods(In, Src, SrcMods);
-}
-
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
@@ -2705,7 +2780,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
(
Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
+ Ld->isSimple() &&
!N->isDivergent() &&
static_cast<const SITargetLowering *>(
getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)