Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
 lib/Target/AMDGPU/SIISelLowering.cpp | 1663
 1 file changed, 1448 insertions(+), 215 deletions(-)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 2356405f0919..50ee88fa635a 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -32,6 +32,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -45,11 +46,14 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -70,9 +74,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetCallingConv.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cmath>
#include <cstdint>
@@ -83,11 +85,21 @@
using namespace llvm;
+#define DEBUG_TYPE "si-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
static cl::opt<bool> EnableVGPRIndexMode(
"amdgpu-vgpr-index-mode",
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
+ "amdgpu-frame-index-zero-bits",
+ cl::desc("High bits of frame index assumed to be zero"),
+ cl::init(5),
+ cl::ReallyHidden);
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -214,6 +226,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+#if 0
+ setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
+ setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
+#endif
+
+ //setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ //setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
@@ -462,6 +482,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -496,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -528,8 +550,7 @@ const SISubtarget *SITargetLowering::getSubtarget() const {
// TargetLowering queries
//===----------------------------------------------------------------------===//
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
- EVT) const {
+bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
return false;
@@ -537,6 +558,7 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
+ MachineFunction &MF,
unsigned IntrID) const {
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
@@ -545,11 +567,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- Info.vol = !Vol || !Vol->isZero();
- Info.readMem = true;
- Info.writeMem = true;
+ if (!Vol || !Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
return true;
}
default:
@@ -587,6 +610,26 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
+bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
+ if (Subtarget->hasFlatGlobalInsts())
+ return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+
+ if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
+ // Assume we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
+
+ return isLegalMUBUFAddressingMode(AM);
+}
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -624,27 +667,15 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+ unsigned AS, Instruction *I) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
- // Assume the we will use FLAT for all global memory accesses
- // on VI.
- // FIXME: This assumption is currently wrong. On VI we still use
- // MUBUF instructions for the r + i addressing mode. As currently
- // implemented, the MUBUF instructions only work on buffer < 4GB.
- // It may be possible to support > 4GB buffers with MUBUF instructions,
- // by setting the stride value in the resource descriptor which would
- // increase the size limit to (stride * 4GB). However, this is risky,
- // because it has never been validated.
- return isLegalFlatAddressingMode(AM);
- }
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+ return isLegalGlobalAddressingMode(AM);
- return isLegalMUBUFAddressingMode(AM);
- } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -656,7 +687,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
if (DL.getTypeStoreSize(Ty) < 4)
- return isLegalMUBUFAddressingMode(AM);
+ return isLegalGlobalAddressingMode(AM);
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
@@ -888,18 +919,30 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- unsigned InputPtrReg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+
+ std::tie(InputPtrReg, RC)
+ = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
- MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+ MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
+
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
+SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
+ const SDLoc &SL) const {
+ auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+ uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
+}
+
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
@@ -991,6 +1034,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA
return ArgValue;
}
+SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
+ const SIMachineFunctionInfo &MFI,
+ EVT VT,
+ AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
+ const ArgDescriptor *Reg;
+ const TargetRegisterClass *RC;
+
+ std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+ return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+}
+
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1041,29 +1095,131 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
}
// Allocate special inputs passed in VGPRs.
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
if (Info.hasWorkItemIDX()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ unsigned Reg = AMDGPU::VGPR0;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ unsigned Reg = AMDGPU::VGPR1;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ unsigned Reg = AMDGPU::VGPR2;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
}
}
+// Try to allocate a VGPR at the end of the argument list, or if no argument
+// VGPRs are left, allocate a stack slot.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+ ArrayRef<MCPhysReg> ArgVGPRs
+ = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
+ unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
+ if (RegIdx == ArgVGPRs.size()) {
+ // Spill to stack required.
+ int64_t Offset = CCInfo.AllocateStack(4, 4);
+
+ return ArgDescriptor::createStack(Offset);
+ }
+
+ unsigned Reg = ArgVGPRs[RegIdx];
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
+ const TargetRegisterClass *RC,
+ unsigned NumArgRegs) {
+ ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
+ unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
+ if (RegIdx == ArgSGPRs.size())
+ report_fatal_error("ran out of SGPRs for arguments");
+
+ unsigned Reg = ArgSGPRs[RegIdx];
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, RC);
+ return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
+ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+}
+
+static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
+ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+}
+
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasWorkItemIDX())
+ Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+
+ if (Info.hasWorkItemIDY())
+ Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+
+ if (Info.hasWorkItemIDZ())
+ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+}
+
+static void allocateSpecialInputSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ auto &ArgInfo = Info.getArgInfo();
+
+ // TODO: Unify handling with private memory pointers.
+
+ if (Info.hasDispatchPtr())
+ ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasQueuePtr())
+ ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasKernargSegmentPtr())
+ ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+
+ if (Info.hasDispatchID())
+ ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+
+ // flat_scratch_init is not applicable for non-kernel functions.
+
+ if (Info.hasWorkGroupIDX())
+ ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasWorkGroupIDY())
+ ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasWorkGroupIDZ())
+ ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+
+ if (Info.hasImplicitArgPtr())
+ ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
+}
+
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
@@ -1187,20 +1343,38 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ // For now assume stack access is needed in any callee functions, so we need
+ // the scratch registers to pass in.
+ bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ if (MFI.hasCalls()) {
+ // If we have calls, we need to keep the frame register in a register
+ // that won't be clobbered by a call, so ensure it is copied somewhere.
+
+ // This is not a problem for the scratch wave offset, because the same
+ // registers are reserved in all functions.
+
+ // FIXME: Nothing is really ensuring this is a call preserved register,
+ // it's just selected from the end so it happens to be.
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ } else {
+ unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ }
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@@ -1223,9 +1397,9 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
- unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ if (HasStackObjects && !MFI.hasCalls()) {
+ unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
@@ -1235,6 +1409,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
+bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
+}
+
+void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+
+}
+
+void SITargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_64RegClass;
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1242,14 +1460,14 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
- FunctionType *FType = MF.getFunction()->getFunctionType();
+ FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function *Fn = MF.getFunction();
+ const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
- *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+ Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
return DAG.getEntryNode();
}
@@ -1269,6 +1487,12 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
+ if (!IsEntryFunc) {
+ // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
+ // this when allocating argument fixed offsets.
+ CCInfo.AllocateStack(4, 4);
+ }
+
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1285,14 +1509,31 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (CallConv == CallingConv::AMDGPU_PS &&
- ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11)))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->markPSInputEnabled(0);
+ if (CallConv == CallingConv::AMDGPU_PS) {
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
+ if (Subtarget->isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the
+ // frontend set up an input arg for a particular interpolation mode, but
+ // nothing uses that input arg. Really we should have an earlier pass
+ // that removes such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 &&
+ (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
+ }
}
assert(!Info->hasDispatchPtr() &&
@@ -1308,7 +1549,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (IsEntryFunc) {
- allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1375,6 +1616,17 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ // The return object should be reasonably addressable.
+
+ // FIXME: This helps when the return is a real sret. If it is an
+ // automatically inserted sret (i.e. CanLowerReturn returns false), an
+ // extra copy is inserted in SelectionDAGBuilder which obscures this.
+ unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
+ }
+
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
@@ -1427,6 +1679,11 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+ if (!IsEntryFunc) {
+ // Special inputs come after user arguments.
+ allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
@@ -1434,8 +1691,16 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ Info->setBytesInStackArgArea(StackArgSize);
+
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
@@ -1575,6 +1840,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// FIXME: Does sret work properly?
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+ }
// Update chain and glue.
RetOps[0] = Chain;
@@ -1587,6 +1868,563 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
+SDValue SITargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+ SDValue Val;
+
+ if (VA.isRegLoc()) {
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ } else if (VA.isMemLoc()) {
+ report_fatal_error("TODO: return values in memory");
+ } else
+ llvm_unreachable("unknown argument location type");
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// Add code to pass special inputs required depending on used features, separate
+// from the explicit user arguments present in the IR.
+void SITargetLowering::passSpecialInputs(
+ CallLoweringInfo &CLI,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ SDValue Chain,
+ SDValue StackPtr) const {
+ // If we don't have a call site, this was a call inserted by
+ // legalization. These can never use special inputs.
+ if (!CLI.CS)
+ return;
+
+ const Function *CalleeFunc = CLI.CS.getCalledFunction();
+ assert(CalleeFunc);
+
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+
+ const SISubtarget *ST = getSubtarget();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ const AMDGPUFunctionArgInfo &CalleeArgInfo
+ = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+
+ const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+
+ // TODO: Unify with private memory register handling. This is complicated by
+ // the fact that at least in kernels, the input argument is not necessarily
+ // in the same location as the input.
+ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
+ AMDGPUFunctionArgInfo::DISPATCH_PTR,
+ AMDGPUFunctionArgInfo::QUEUE_PTR,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+ AMDGPUFunctionArgInfo::DISPATCH_ID,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_X,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
+ };
+
+ for (auto InputID : InputRegs) {
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+
+ std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+ if (!OutgoingArg)
+ continue;
+
+ const ArgDescriptor *IncomingArg;
+ const TargetRegisterClass *IncomingArgRC;
+ std::tie(IncomingArg, IncomingArgRC)
+ = CallerArgInfo.getPreloadedValue(InputID);
+ assert(IncomingArgRC == ArgRC);
+
+ // All special arguments are ints for now.
+ EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
+ SDValue InputReg;
+
+ if (IncomingArg) {
+ InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
+ } else {
+ // The implicit arg ptr is special because it doesn't have a corresponding
+ // input for kernels, and is computed from the kernarg segment pointer.
+ assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ InputReg = getImplicitArgPtr(DAG, DL);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ } else {
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
+ InputReg,
+ OutgoingArg->getStackOffset());
+ MemOpChains.push_back(ArgStore);
+ }
+ }
+}
+
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+bool SITargetLowering::isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
+ return false;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+
+ // Kernels aren't callable, and don't have a live in return address so it
+ // doesn't make sense to do a tail call with entry functions.
+ if (!CallerPreserved)
+ return false;
+
+ bool CCMatch = CallerCC == CalleeCC;
+
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+ // TODO: Can we handle var args?
+ if (IsVarArg)
+ return false;
+
+ for (const Argument &Arg : CallerF.args()) {
+ if (Arg.hasByValAttr())
+ return false;
+ }
+
+ LLVMContext &Ctx = *DAG.getContext();
+
+ // Check that the call results are passed in the same way.
+ if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
+ CCAssignFnForCall(CalleeCC, IsVarArg),
+ CCAssignFnForCall(CallerCC, IsVarArg)))
+ return false;
+
+ // The callee has to preserve all registers the caller needs to preserve.
+ if (!CCMatch) {
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+ }
+
+ // Nothing more to check if the callee is taking no arguments.
+ if (Outs.empty())
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
+
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ // TODO: Is this really necessary?
+ if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+ if (!CI->isTailCall())
+ return false;
+
+ const Function *ParentFn = CI->getParent()->getParent();
+ if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
+ return false;
+
+ auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
+ return (Attr.getValueAsString() != "true");
+}
+
+// The wave scratch offset register is used as the global base pointer.
+SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool IsSibCall = false;
+ bool IsThisReturn = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (IsVarArg) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call to variadic function ");
+ }
+
+ if (!CLI.CS.getCalledFunction()) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported indirect call to function ");
+ }
+
+ if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported required tail call to function ");
+ }
+
+ // The first 4 bytes are reserved for the callee's emergency stack slot.
+ const unsigned CalleeUsableStackOffset = 4;
+
+ if (IsTailCall) {
+ IsTailCall = isEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+ if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+ }
+
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // A sibling call is one where we're under the usual C ABI and not planning
+ // to change that but can still do a tail call:
+ if (!TailCallOpt && IsTailCall)
+ IsSibCall = true;
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ }
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // FIXME: Remove this hack for function pointer types after removing
+ // support of old address space mapping. In the new address space
+ // mapping the pointer in default address space is 64 bit, therefore
+ // does not need this hack.
+ if (Callee.getValueType() == MVT::i32) {
+ const GlobalValue *GV = GA->getGlobal();
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
+ GA->getTargetFlags());
+ }
+ }
+ assert(Callee.getValueType() == MVT::i64);
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int32_t FPDiff = 0;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ SDValue CallerSavedFP;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall) {
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+ unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+
+ // TODO: Don't hardcode these registers and get from the callee function.
+ SDValue ScratchWaveOffsetReg
+ = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
+
+ if (!Info->isEntryFunction()) {
+ // Avoid clobbering this function's FP value. In the current convention
+ // callee will overwrite this, so do save/restore around the call site.
+ CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
+ Info->getFrameOffsetReg(), MVT::i32);
+ }
+ }
+
+ // Stack pointer relative accesses are done by changing the offset SGPR. This
+ // is just the VGPR offset component.
+ SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
+
+ SmallVector<SDValue, 8> MemOpChains;
+ MVT PtrVT = MVT::i32;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset;
+
+ SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+
+ if (IsTailCall) {
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ unsigned OpSize = Flags.isByVal() ?
+ Flags.getByValSize() : VA.getValVT().getStoreSize();
+
+ Offset = Offset + FPDiff;
+ int FI = MFI.CreateFixedObject(OpSize, Offset, true);
+
+ DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
+ StackPtr);
+ DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+ // Make sure any stack arguments overlapping with where we're storing
+ // are loaded before this eventual operation. Otherwise they'll be
+ // clobbered.
+
+ // FIXME: Why is this really necessary? This seems to just result in a
+ // lot of code to copy the stack and write them back to the same
+ // locations, which are supposed to be immutable?
+ Chain = addTokenForArgument(Chain, DAG, MFI, FI);
+ } else {
+ DstAddr = PtrOff;
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false, DstInfo,
+ MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
+ *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+
+ SDValue PhysReturnAddrReg;
+ if (IsTailCall) {
+ // Since the return is being combined with the call, we need to pass on the
+ // return address.
+
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ SDValue ReturnAddrReg = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+ PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+ MVT::i64);
+ Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+ Ops.push_back(PhysReturnAddrReg);
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass) {
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MFI.setHasTailCall();
+ return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
+ Chain = Call.getValue(0);
+ InFlag = Call.getValue(1);
+
+ if (CallerSavedFP) {
+ SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
+ Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ uint64_t CalleePopBytes = NumBytes;
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -1644,7 +2482,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
if (SplitPoint == BB->end()) {
// Don't bother with a new block.
- MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return BB;
}
@@ -1658,7 +2496,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
SplitBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(SplitBB);
- MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return SplitBB;
}
@@ -1775,8 +2613,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
MachineBasicBlock::iterator I(&MI);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
@@ -2121,19 +2959,66 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
if (MI.mayLoad())
Flags |= MachineMemOperand::MOLoad;
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
+ if (Flags != MachineMemOperand::MODereferenceable) {
+ auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+ MI.addMemOperand(*MF, MMO);
+ }
+
return BB;
}
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0:
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32_XM0RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
+ Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.add(MI.getOperand(0));
MI.eraseFromParent();
return BB;
-
+ }
case AMDGPU::SI_INIT_EXEC:
// This should be before all vector instructions.
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
@@ -2212,7 +3097,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
return emitIndirectDst(MI, *BB, *getSubtarget());
- case AMDGPU::SI_KILL:
+ case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+ case AMDGPU::SI_KILL_I1_PSEUDO:
return splitKillBlock(MI, BB);
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
@@ -2225,15 +3111,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
+ .addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
.addReg(Src0, 0, AMDGPU::sub0)
.addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCond);
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
.addReg(Src0, 0, AMDGPU::sub1)
.addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCond);
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
@@ -2252,11 +3141,57 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::ADJCALLSTACKUP:
+ case AMDGPU::ADJCALLSTACKDOWN: {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ MachineInstrBuilder MIB(*MF, &MI);
+ MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ return BB;
+ }
+ case AMDGPU::SI_CALL_ISEL:
+ case AMDGPU::SI_TCRETURN_ISEL: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned GlobalAddrReg = MI.getOperand(0).getReg();
+ MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
+ assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+
+ const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+
+ MachineInstrBuilder MIB;
+ if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+ .add(MI.getOperand(0))
+ .addGlobalAddress(G);
+ } else {
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
+ .add(MI.getOperand(0))
+ .addGlobalAddress(G);
+
+ // There is an additional imm operand for tcreturn, but it should be in the
+ // right place already.
+ }
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
}
+bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+ return isTypeLegal(VT.getScalarType());
+}
+
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
// This currently forces unfolding various combinations of fsub into fma with
// free fneg'd operands. As long as we have fast FMA (controlled by
@@ -2356,7 +3291,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
-
case ISD::TRAP:
case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
@@ -2660,11 +3594,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
case SISubtarget::TrapIDLLVMTrap:
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
case SISubtarget::TrapIDLLVMDebugTrap: {
- DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
DS_Warning);
- LLVMContext &Ctx = MF.getFunction()->getContext();
+ LLVMContext &Ctx = MF.getFunction().getContext();
Ctx.diagnose(NoTrap);
return Chain;
}
@@ -2709,8 +3643,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
- DAG.getConstant(StructOffset, DL, MVT::i64));
+ SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
@@ -2778,7 +3711,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
const MachineFunction &MF = DAG.getMachineFunction();
DiagnosticInfoUnsupported InvalidAddrSpaceCast(
- *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);
return DAG.getUNDEF(ASC->getValueType(0));
@@ -2917,13 +3850,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))
@@ -2977,7 +3913,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
EVT VT) {
- DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
"non-hsa intrinsic with hsa target",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
@@ -2986,7 +3922,7 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
EVT VT) {
- DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
"intrinsic not supported on subtarget",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
@@ -2997,7 +3933,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -3009,38 +3944,35 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_implicit_buffer_ptr: {
if (getSubtarget()->isAmdCodeObjectV2(MF))
return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- unsigned Reg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::IMPLICIT_BUFFER_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdCodeObjectV2(MF)) {
DiagnosticInfoUnsupported BadIntrin(
- *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+ MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
}
- auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
- SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, Reg), VT);
+ auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
+ return getPreloadedValue(DAG, *MFI, VT, RegID);
}
case Intrinsic::amdgcn_implicitarg_ptr: {
- unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
- return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+ if (MFI->isEntryFunction())
+ return getImplicitArgPtr(DAG, DL);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
- unsigned Reg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
case Intrinsic::amdgcn_dispatch_id: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
}
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
@@ -3125,28 +4057,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
- case Intrinsic::amdgcn_workitem_id_x:
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_workitem_id_x: {
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDX);
+ }
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
@@ -3193,7 +4129,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
- *MF.getFunction(), "intrinsic not supported on subtarget",
+ MF.getFunction(), "intrinsic not supported on subtarget",
DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
@@ -3224,7 +4160,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// 3rd parameter required to be a constant.
const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
if (!Param)
- return DAG.getUNDEF(VT);
+ return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
// Translate to the operands expected by the machine instruction. The
// first parameter must be the same as the first instruction.
@@ -3292,6 +4228,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
+ case Intrinsic::amdgcn_wqm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
+ 0);
+ }
+ case Intrinsic::amdgcn_wwm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+ 0);
+ }
+ case Intrinsic::amdgcn_image_getlod:
+ case Intrinsic::amdgcn_image_getresinfo: {
+ unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
+
+    // If every channel in the dmask is disabled, fold the result to undef.
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
+ if (!DMask || DMask->isNullValue())
+ return DAG.getUNDEF(Op.getValueType());
+ return SDValue();
+ }
default:
return Op;
}
@@ -3365,6 +4321,95 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_buffer_atomic_xor: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // offset
+ Op.getOperand(6) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile,
+ VT.getStoreSize(), 4);
+ unsigned Opcode = 0;
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_buffer_atomic_swap:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+ break;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ }
+
+ case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Op.getOperand(4), // rsrc
+ Op.getOperand(5), // vindex
+ Op.getOperand(6), // offset
+ Op.getOperand(7) // slc
+ };
+ EVT VT = Op.getOperand(4).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile,
+ VT.getStoreSize(), 4);
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
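The return value of the cmpswap lowering above follows the usual compare-and-swap contract: the old memory value comes back, and the store happens only when it matched the compare operand. A minimal single-lane model, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Returns the original memory value; the store happens only on a match.
static uint32_t bufferAtomicCmpSwapRef(uint32_t &Mem, uint32_t Src, uint32_t Cmp) {
  uint32_t Old = Mem;
  if (Old == Cmp)
    Mem = Src;
  return Old;
}

int main() {
  uint32_t Mem = 7;
  assert(bufferAtomicCmpSwapRef(Mem, 9, 7) == 7 && Mem == 9); // match: swapped
  assert(bufferAtomicCmpSwapRef(Mem, 5, 7) == 9 && Mem == 9); // no match: untouched
  return 0;
}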
// Basic sample.
case Intrinsic::amdgcn_image_sample:
case Intrinsic::amdgcn_image_sample_cl:
@@ -3411,9 +4456,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
case Intrinsic::amdgcn_image_sample_c_lz_o:
case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- case Intrinsic::amdgcn_image_getlod: {
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
     // If every channel in the dmask is disabled, fold the result to undef.
const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
if (!DMask || DMask->isNullValue()) {
@@ -3516,7 +4559,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+ unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
Op.getOperand(0)), 0);
@@ -3592,6 +4635,30 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, VT, MMO);
}
+ case Intrinsic::amdgcn_buffer_store:
+ case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // offset
+ Op.getOperand(6), // glc
+ Op.getOperand(7) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable,
+ VT.getStoreSize(), 4);
+
+ unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE :
+ AMDGPUISD::BUFFER_STORE_FORMAT;
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ }
+
default:
return Op;
}
@@ -3604,6 +4671,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = Load->getMemoryVT();
if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
+ return SDValue();
+
// FIXME: Copied from PPC
// First, load into 32 bits, then truncate to 1 bit.
@@ -4187,32 +5257,6 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
return SDValue();
}
-/// \brief Return true if the given offset Size in bytes can be folded into
-/// the immediate offsets of a memory instruction for the given address space.
-static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
- const SISubtarget &STI) {
- auto AMDGPUASI = STI.getAMDGPUAS();
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
- // MUBUF instructions a 12-bit offset in bytes.
- return isUInt<12>(OffsetSize);
- }
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- // SMRD instructions have an 8-bit offset in dwords on SI and
- // a 20-bit offset in bytes on VI.
- if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
- return isUInt<20>(OffsetSize);
- else
- return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
- }
- if (AS == AMDGPUASI.LOCAL_ADDRESS ||
- AS == AMDGPUASI.REGION_ADDRESS) {
- // The single offset versions have a 16-bit offset in bytes.
- return isUInt<16>(OffsetSize);
- }
- // Indirect register addressing does not use any offsets.
- return false;
-}
-
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
// This is a variant of
@@ -4229,11 +5273,15 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
unsigned AddrSpace,
+ EVT MemVT,
DAGCombinerInfo &DCI) const {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N0.getOpcode() != ISD::ADD)
+ // We only do this to handle cases where it's profitable when there are
+ // multiple uses of the add, so defer to the standard combine.
+ if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
+ N0->hasOneUse())
return SDValue();
const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
@@ -4247,7 +5295,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
- if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
+ Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
+
+ AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = Offset.getSExtValue();
+ if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4257,7 +5310,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
- return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
+ (N0.getOpcode() == ISD::OR ||
+ N0->getFlags().hasNoUnsignedWrap()));
+
+ return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
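A worked example of the rewrite handled here, (shl (add x, c1), c2) -> (add (shl x, c2), (shl c1, c2)): both forms compute the same address, and only the folded constant offset has to be accepted by isLegalAddressingMode. A small sketch with made-up values:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x1000;      // variable part of the index
  uint64_t C1 = 12, C2 = 3; // add constant and shift amount (element -> byte)

  uint64_t Before = (X + C1) << C2;        // shl (add x, c1), c2
  uint64_t After = (X << C2) + (C1 << C2); // add (shl x, c2), (shl c1, c2)
  assert(Before == After);

  // Only the constant part, C1 << C2 = 96 bytes here, must fit the addressing
  // mode's immediate offset for the combine to be worthwhile.
  assert((C1 << C2) == 96);
  return 0;
}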
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
@@ -4267,9 +5325,9 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
SDLoc SL(N);
// TODO: We could also do this for multiplies.
- unsigned AS = N->getAddressSpace();
- if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
- SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+ if (Ptr.getOpcode() == ISD::SHL) {
+ SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
+ N->getMemoryVT(), DCI);
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
@@ -4818,15 +5876,27 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
+static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+ return C;
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
+ if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
+ return C;
+ }
+
+ return nullptr;
+}
+
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Op0,
SDValue Op1) const {
- ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
if (!K1)
return SDValue();
- ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
if (!K0)
return SDValue();
@@ -4836,7 +5906,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
// TODO: Check IEEE bit enabled?
- EVT VT = K0->getValueType(0);
+ EVT VT = Op0.getValueType();
if (Subtarget->enableDX10Clamp()) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
@@ -4845,19 +5915,21 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
- // med3 for f16 is only available on gfx9+.
- if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
- return SDValue();
+ // med3 for f16 is only available on gfx9+, and not available for v2f16.
+ if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
+ // then give the other result, which is different from med3 with a NaN
+ // input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
- // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
- // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
- // give the other result, which is different from med3 with a NaN input.
- SDValue Var = Op0.getOperand(0);
- if (!isKnownNeverSNan(DAG, Var))
- return SDValue();
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+ }
- return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
- Var, SDValue(K0, 0), SDValue(K1, 0));
+ return SDValue();
}
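The clamp-to-med3 folding above relies on the identity that, for constants K0 <= K1 and a non-NaN input, min(max(x, K0), K1) equals the median of the three values. A standalone scalar check with illustrative constants:

#include <algorithm>
#include <cassert>

// Median of three values.
static float med3(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const float K0 = -1.0f, K1 = 4.0f;
  const float Tests[] = {-5.0f, -1.0f, 0.5f, 4.0f, 100.0f};
  for (float X : Tests)
    assert(med3(X, K0, K1) == std::min(std::max(X, K0), K1));
  return 0;
}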
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
@@ -4918,7 +5990,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
- (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
+ (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
@@ -4994,7 +6067,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
- SelectionDAG &DAG= DCI.DAG;
+ SelectionDAG &DAG = DCI.DAG;
if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
@@ -5007,6 +6080,47 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
return SDValue();
}
+static bool convertBuildVectorCastElt(SelectionDAG &DAG,
+ SDValue &Lo, SDValue &Hi) {
+ if (Hi.getOpcode() == ISD::BITCAST &&
+ Hi.getOperand(0).getValueType() == MVT::f16 &&
+ (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
+ Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
+ Hi = Hi.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
+
+SDValue SITargetLowering::performBuildVectorCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+
+ if (!isTypeLegal(MVT::v2i16))
+ return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::v2i16) {
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+
+ // v2i16 build_vector (const|undef), (bitcast f16:$x)
+    //  -> bitcast (v2f16 build_vector (const|undef), $x)
+ if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
+ SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+ }
+
+ if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
+ SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
+ return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
+ }
+ }
+
+ return SDValue();
+}
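The v2i16 combine above only reassociates bitcasts: packing two 16-bit values and reinterpreting the pair yields the same 32 bits as reinterpreting each half first. A quick host-side illustration, assuming little-endian packing and made-up f16 bit patterns:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t Lo = 0x3C00; // raw bits of an f16 constant element (1.0)
  uint16_t Hi = 0xC000; // raw bits of a bitcast f16 value (-2.0)

  uint32_t AsI16Pair = (uint32_t)Hi << 16 | Lo; // v2i16 build_vector view
  uint32_t AsF16Pair;                           // bitcast (v2f16 build_vector) view
  const uint16_t Halves[2] = {Lo, Hi};
  std::memcpy(&AsF16Pair, Halves, sizeof(AsF16Pair));

  assert(AsI16Pair == AsF16Pair);
  return 0;
}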
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
@@ -5030,18 +6144,57 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
+ EVT VT,
+ SDValue N0, SDValue N1, SDValue N2,
+ bool Signed) {
+ unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
+ SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
+}
+
SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
-
- if (VT != MVT::i32)
- return SDValue();
-
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
+ && Subtarget->hasMad64_32() &&
+ !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
+ VT.getScalarSizeInBits() <= 64) {
+ if (LHS.getOpcode() != ISD::MUL)
+ std::swap(LHS, RHS);
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // TODO: Maybe restrict if SGPR inputs.
+ if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
+ numBitsUnsigned(MulRHS, DAG) <= 32) {
+ MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ }
+
+ if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+ MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ }
+
+ return SDValue();
+ }
+
+ if (VT != MVT::i32)
+ return SDValue();
+
// add x, zext (setcc) => addcarry x, 0, setcc
// add x, sext (setcc) => subcarry x, 0, setcc
unsigned Opc = LHS.getOpcode();
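The 64-bit add combine introduced above is sound because, once both multiplicands are known to fit in 32 bits, the 32x32->64 multiply of MAD_U64_U32/MAD_I64_I32 loses nothing relative to a full i64 multiply. A standalone sanity check of the unsigned case, with arbitrary values:

#include <cassert>
#include <cstdint>

// Reference: full-width i64 multiply-add.
static uint64_t refMulAdd(uint64_t A, uint64_t B, uint64_t C) {
  return A * B + C;
}

// What the mad_u64_u32 form computes: 32x32 -> 64 multiply plus a 64-bit addend.
static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
  return (uint64_t)A * (uint64_t)B + C;
}

int main() {
  uint64_t A = 0x12345678, B = 0x9abcdef0, C = 0x0123456789abcdefULL;
  // Both A and B fit in 32 unsigned bits, so truncating them is lossless and
  // the two computations agree.
  assert(refMulAdd(A, B, C) == madU64U32((uint32_t)A, (uint32_t)B, C));
  return 0;
}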
@@ -5428,6 +6581,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return performBuildVectorCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -5444,13 +6599,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
- SelectionDAG &DAG) const {
- SDNode *Users[4] = { };
+SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
+ bool HasChain = Node->getNumValues() > 1;
+
+ if (OldDmask == 0) {
+    // These are folded out, but on the off chance one reaches here, don't assert.
+ return Node;
+ }
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -5463,9 +6624,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
- return;
+ return Node;
- // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+ // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
// Note that subregs are packed, i.e. Lane==0 is the first bit set
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
// set, etc.
@@ -5474,14 +6635,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
- assert(Dmask);
Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);
}
// Abort if we have more than one user per component
if (Users[Lane])
- return;
+ return Node;
Users[Lane] = *I;
NewDmask |= 1 << Comp;
@@ -5489,25 +6649,47 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if there's no change
if (NewDmask == OldDmask)
- return;
+ return Node;
+
+ unsigned BitsSet = countPopulation(NewDmask);
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
+ Node->getMachineOpcode(), BitsSet);
+ assert(NewOpcode != -1 &&
+ NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
+ "failed to find equivalent MIMG op");
// Adjust the writemask in the node
- std::vector<SDValue> Ops;
+ SmallVector<SDValue, 12> Ops;
Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
- Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
- // If we only got one lane, replace it with a copy
- // (if NewDmask has only one bit set...)
- if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
- SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
- MVT::i32);
- SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
- SDLoc(), Users[Lane]->getValueType(0),
- SDValue(Node, 0), RC);
+
+ MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
+
+ MVT ResultVT = BitsSet == 1 ?
+ SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+ SDVTList NewVTList = HasChain ?
+ DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
+ NewVTList, Ops);
+
+ if (HasChain) {
+ // Update chain.
+ NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+ }
+
+ if (BitsSet == 1) {
+ assert(Node->hasNUsesOfValue(1, 0));
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
+ SDLoc(Node), Users[Lane]->getValueType(0),
+ SDValue(NewNode, 0));
DAG.ReplaceAllUsesWith(Users[Lane], Copy);
- return;
+ return nullptr;
}
// Update the users of the node with the new indices
@@ -5517,7 +6699,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
continue;
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
switch (Idx) {
default: break;
@@ -5526,6 +6708,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
}
}
+
+ DAG.RemoveDeadNode(Node);
+ return nullptr;
}
static bool isFrameIndexOp(SDValue Op) {
@@ -5579,25 +6764,80 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
Node->getOperand(i)), 0));
}
- DAG.UpdateNodeOperands(Node, Ops);
- return Node;
+ return DAG.UpdateNodeOperands(Node, Ops);
}
/// \brief Fold the instructions after selecting them.
+/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
- !TII->isGather4(Opcode))
- adjustWritemask(Node, DAG);
+ !TII->isGather4(Opcode)) {
+ return adjustWritemask(Node, DAG);
+ }
if (Opcode == AMDGPU::INSERT_SUBREG ||
Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
+
+ switch (Opcode) {
+ case AMDGPU::V_DIV_SCALE_F32:
+ case AMDGPU::V_DIV_SCALE_F64: {
+ // Satisfy the operand register constraint when one of the inputs is
+ // undefined. Ordinarily each undef value will have its own implicit_def of
+ // a vreg, so force these to use a single register.
+ SDValue Src0 = Node->getOperand(0);
+ SDValue Src1 = Node->getOperand(1);
+ SDValue Src2 = Node->getOperand(2);
+
+ if ((Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
+ (Src0 == Src1 || Src0 == Src2))
+ break;
+
+ MVT VT = Src0.getValueType().getSimpleVT();
+ const TargetRegisterClass *RC = getRegClassFor(VT);
+
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
+
+ SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
+ UndefReg, Src0, SDValue());
+
+ // src0 must be the same register as src1 or src2, even if the value is
+ // undefined, so make sure we don't violate this constraint.
+ if (Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
+ if (Src1.isMachineOpcode() &&
+ Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src1;
+ else if (Src2.isMachineOpcode() &&
+ Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src2;
+ else {
+ assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
+ Src0 = UndefReg;
+ Src1 = UndefReg;
+ }
+ } else
+ break;
+
+ SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
+ for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
+ Ops.push_back(Node->getOperand(I));
+
+ Ops.push_back(ImpDef.getValue(1));
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ default:
+ break;
+ }
+
return Node;
}
@@ -5615,31 +6855,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
return;
}
- if (TII->isMIMG(MI)) {
- unsigned VReg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI.getRegClass(VReg);
- // TODO: Need mapping tables to handle other cases (register classes).
- if (RC != &AMDGPU::VReg_128RegClass)
- return;
-
- unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
- unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
- unsigned BitsSet = 0;
- for (unsigned i = 0; i < 4; ++i)
- BitsSet += Writemask & (1 << i) ? 1 : 0;
- switch (BitsSet) {
- default: return;
- case 1: RC = &AMDGPU::VGPR_32RegClass; break;
- case 2: RC = &AMDGPU::VReg_64RegClass; break;
- case 3: RC = &AMDGPU::VReg_96RegClass; break;
- }
-
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
- MI.setDesc(TII->get(NewOpcode));
- MRI.setRegClass(VReg, RC);
- return;
- }
-
// Replace unused atomics with the no return version.
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
@@ -5870,3 +7085,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
TargetLoweringBase::finalizeLowering(MF);
}
+
+void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
+ DAG, Depth);
+
+ if (getSubtarget()->enableHugePrivateBuffer())
+ return;
+
+ // Technically it may be possible to have a dispatch with a single workitem
+ // that uses the full private memory size, but that's not really useful. We
+ // can't use vaddr in MUBUF instructions if we don't know the address
+ // calculation won't overflow, so assume the sign bit is never set.
+ Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+}
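For scale, the default of 5 assumed-zero high bits bounds a 32-bit frame index below 2^27, i.e. 128 MiB of scratch, which in particular keeps the sign bit clear. A quick numeric check:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned HighZeroBits = 5;
  const uint64_t MaxFrameIndex = (1ull << (32 - HighZeroBits)) - 1;
  assert(MaxFrameIndex == (1ull << 27) - 1);  // < 128 MiB
  assert((MaxFrameIndex & 0x80000000u) == 0); // sign bit never set
  return 0;
}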