Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 38
1 file changed, 26 insertions, 12 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11e4ba4b5010..aaed280a1270 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -118,8 +117,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       // Add a small bonus for each of such "if" statements.
       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
         if (UP.Threshold < MaxBoost && Br->isConditional()) {
-          if (L->isLoopExiting(Br->getSuccessor(0)) ||
-              L->isLoopExiting(Br->getSuccessor(1)))
+          BasicBlock *Succ0 = Br->getSuccessor(0);
+          BasicBlock *Succ1 = Br->getSuccessor(1);
+          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
+              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
             continue;
           if (dependsOnLocalPhi(L, Br->getCondition())) {
             UP.Threshold += UnrollThresholdIf;
@@ -141,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       unsigned Threshold = 0;
       if (AS == AMDGPUAS::PRIVATE_ADDRESS)
         Threshold = ThresholdPrivate;
-      else if (AS == AMDGPUAS::LOCAL_ADDRESS)
+      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
         Threshold = ThresholdLocal;
       else
         continue;
@@ -159,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
         if (AllocaSize > MaxAlloca)
           continue;
-      } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
         // Inhibit unroll for local memory if we have seen addressing not to
         // a variable, most likely we will be unable to combine it.
@@ -254,7 +256,8 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
-      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
     return 512;
   }
 
@@ -308,6 +311,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   switch (Inst->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -399,7 +404,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
     if (SLT == MVT::f64) {
       int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
       // Add cost of workaround.
-      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      if (!ST->hasUsableDivScaleConditionOutput())
         Cost += 3 * getFullRateInstrCost();
 
       return LT.first * Cost * NElts;
@@ -577,6 +582,8 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
       return false;
     case Intrinsic::amdgcn_readfirstlane:
     case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_icmp:
+    case Intrinsic::amdgcn_fcmp:
       return true;
     }
   }
@@ -607,7 +614,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 }
 
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
-                                          const Function *Callee) const {
+                                     const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
   const FeatureBitset &CallerBits =
     TM.getSubtargetImpl(*Caller)->getFeatureBits();
@@ -616,7 +623,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
 
   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
-  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+    return false;
+
+  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
+  // no way to support merge for backend defined attributes.
+  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+  return CallerMode.isInlineCompatible(CalleeMode);
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,