aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp241
1 files changed, 187 insertions, 54 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e90487065992..01ac9968181a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -29,6 +29,39 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
+static cl::opt<unsigned> UnrollThresholdPrivate(
+ "amdgpu-unroll-threshold-private",
+ cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
+ cl::init(2500), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdLocal(
+ "amdgpu-unroll-threshold-local",
+ cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+ cl::init(1000), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdIf(
+ "amdgpu-unroll-threshold-if",
+ cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+ cl::init(150), cl::Hidden);
+
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+ unsigned Depth = 0) {
+ const Instruction *I = dyn_cast<Instruction>(Cond);
+ if (!I)
+ return false;
+
+ for (const Value *V : I->operand_values()) {
+ if (!L->contains(I))
+ continue;
+ if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+ return SubLoop->contains(PHI); }))
+ return true;
+ } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+ return true;
+ }
+ return false;
+}
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
@@ -38,29 +71,115 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
// TODO: Do we want runtime unrolling?
+ // Maximum alloca size than can fit registers. Reserve 16 registers.
+ const unsigned MaxAlloca = (256 - 16) * 4;
+ unsigned ThresholdPrivate = UnrollThresholdPrivate;
+ unsigned ThresholdLocal = UnrollThresholdLocal;
+ unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+ AMDGPUAS ASST = ST->getAMDGPUAS();
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
+ unsigned LocalGEPsSeen = 0;
+
+ if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+ return SubLoop->contains(BB); }))
+ continue; // Block belongs to an inner loop.
+
for (const Instruction &I : *BB) {
+
+ // Unroll a loop which contains an "if" statement whose condition
+ // defined by a PHI belonging to the loop. This may help to eliminate
+ // if region and potentially even PHI itself, saving on both divergence
+ // and registers used for the PHI.
+ // Add a small bonus for each of such "if" statements.
+ if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+ if (UP.Threshold < MaxBoost && Br->isConditional()) {
+ if (L->isLoopExiting(Br->getSuccessor(0)) ||
+ L->isLoopExiting(Br->getSuccessor(1)))
+ continue;
+ if (dependsOnLocalPhi(L, Br->getCondition())) {
+ UP.Threshold += UnrollThresholdIf;
+ DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n" << *L << " due to " << *Br << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ continue;
+ }
+
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
- if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ if (!GEP)
+ continue;
+
+ unsigned AS = GEP->getAddressSpace();
+ unsigned Threshold = 0;
+ if (AS == ASST.PRIVATE_ADDRESS)
+ Threshold = ThresholdPrivate;
+ else if (AS == ASST.LOCAL_ADDRESS)
+ Threshold = ThresholdLocal;
+ else
+ continue;
+
+ if (UP.Threshold >= Threshold)
continue;
- const Value *Ptr = GEP->getPointerOperand();
- const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
- if (Alloca) {
- // We want to do whatever we can to limit the number of alloca
- // instructions that make it through to the code generator. allocas
- // require us to use indirect addressing, which is slow and prone to
- // compiler bugs. If this loop does an address calculation on an
- // alloca ptr, then we want to use a higher than normal loop unroll
- // threshold. This will give SROA a better chance to eliminate these
- // allocas.
- //
- // Don't use the maximum allowed value here as it will make some
- // programs way too big.
- UP.Threshold = 800;
+ if (AS == ASST.PRIVATE_ADDRESS) {
+ const Value *Ptr = GEP->getPointerOperand();
+ const AllocaInst *Alloca =
+ dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ if (!Alloca || !Alloca->isStaticAlloca())
+ continue;
+ Type *Ty = Alloca->getAllocatedType();
+ unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+ if (AllocaSize > MaxAlloca)
+ continue;
+ } else if (AS == ASST.LOCAL_ADDRESS) {
+ LocalGEPsSeen++;
+ // Inhibit unroll for local memory if we have seen addressing not to
+ // a variable, most likely we will be unable to combine it.
+ // Do not unroll too deep inner loops for local memory to give a chance
+ // to unroll an outer loop for a more important reason.
+ if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+ (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+ !isa<Argument>(GEP->getPointerOperand())))
+ continue;
}
+
+ // Check if GEP depends on a value defined by this loop itself.
+ bool HasLoopDef = false;
+ for (const Value *Op : GEP->operands()) {
+ const Instruction *Inst = dyn_cast<Instruction>(Op);
+ if (!Inst || L->isLoopInvariant(Op))
+ continue;
+
+ if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+ return SubLoop->contains(Inst); }))
+ continue;
+ HasLoopDef = true;
+ break;
+ }
+ if (!HasLoopDef)
+ continue;
+
+ // We want to do whatever we can to limit the number of alloca
+ // instructions that make it through to the code generator. allocas
+ // require us to use indirect addressing, which is slow and prone to
+ // compiler bugs. If this loop does an address calculation on an
+ // alloca ptr, then we want to use a higher than normal loop unroll
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // We also want to have more unrolling for local memory to let ds
+ // instructions with different offsets combine.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
+ UP.Threshold = Threshold;
+ DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
}
}
}
@@ -81,28 +200,56 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
}
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- switch (AddrSpace) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::FLAT_ADDRESS:
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS ||
+ AddrSpace == AS.FLAT_ADDRESS)
return 128;
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS:
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
return 64;
- case AMDGPUAS::PRIVATE_ADDRESS:
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- default:
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
- AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
- (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
- AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
- return 128;
- llvm_unreachable("unhandled address space");
+
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+ (AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+ return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+ ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
+ return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ if (VF == 1)
+ return 1;
+
// Semi-arbitrary large amount.
return 64;
}
@@ -228,16 +375,8 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
- const IntrinsicInst *I) {
+static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::not_intrinsic:
- // This means we have an intrinsic that isn't defined in
- // IntrinsicsAMDGPU.td
- break;
-
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z:
@@ -249,6 +388,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::r600_read_tidig_x:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_image_atomic_swap:
case Intrinsic::amdgcn_image_atomic_add:
case Intrinsic::amdgcn_image_atomic_sub:
@@ -274,16 +415,10 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_ps_live:
+ case Intrinsic::amdgcn_ds_swizzle:
return true;
- }
-
- StringRef Name = I->getCalledFunction()->getName();
- switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
default:
return false;
- case AMDGPUIntrinsic::SI_fs_interp:
- case AMDGPUIntrinsic::SI_fs_constant:
- return true;
}
}
@@ -295,8 +430,8 @@ static bool isArgPassedInSGPR(const Argument *A) {
return true;
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
- F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+ if (F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
+ F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal))
return true;
// Everything else is in VGPRs.
@@ -318,7 +453,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
- return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
@@ -327,10 +462,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
return true;
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- const TargetMachine &TM = getTLI()->getTargetMachine();
- return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
- }
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
+ return isIntrinsicSourceOfDivergence(Intrinsic);
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))