Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
 -rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 714
 1 file changed, 539 insertions(+), 175 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index cf908766caa0d..a795493017402 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,8 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -26,6 +28,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -41,6 +44,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
#include <cassert>
#include <iterator>
@@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads(
"amdgpu-codegenprepare-widen-constant-loads",
cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
cl::ReallyHidden,
- cl::init(true));
+ cl::init(false));
static cl::opt<bool> UseMul24Intrin(
"amdgpu-codegenprepare-mul24",
@@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin(
cl::ReallyHidden,
cl::init(true));
+// Legalize 64-bit division by using the generic IR expansion.
+static cl::opt<bool> ExpandDiv64InIR(
+ "amdgpu-codegenprepare-expand-div64",
+ cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
+// Leave all division operations as they are. This supersedes ExpandDiv64InIR
+// and is used for testing the legalizer.
+static cl::opt<bool> DisableIDivExpand(
+ "amdgpu-codegenprepare-disable-idiv-expansion",
+ cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
AssumptionCache *AC = nullptr;
+ DominatorTree *DT = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// SelectionDAG has an issue where an and asserting the bits are known
bool replaceMulWithMul24(BinaryOperator &I) const;
+ /// Perform the same fold as the equivalently named function in DAGCombiner. Since
+ /// we expand some divisions here, we need to do this before the expansion obscures it.
+ bool foldBinOpIntoSelect(BinaryOperator &I) const;
+
+ bool divHasSpecialOptimization(BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ int getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool Signed) const;
+
/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
bool IsDiv, bool IsSigned) const;
+ Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den, unsigned NumBits,
+ bool IsDiv, bool IsSigned) const;
+
/// Expands 32 bit div or rem.
Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den) const;
+ Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ void expandDivRem64(BinaryOperator &I) const;
+
/// Widen a scalar load.
///
/// \details \p Widen scalar load for uniform, small type loads from constant
@@ -195,7 +233,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LegacyDivergenceAnalysis>();
- AU.setPreservesAll();
+
+ // FIXME: Division expansion needs to preserve the dominator tree.
+ if (!ExpandDiv64InIR)
+ AU.setPreservesAll();
}
};
@@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
if (T->isIntegerTy())
return B.getInt32Ty();
- return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+ return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
@@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
const DataLayout &DL = Mod->getDataLayout();
int TySize = DL.getTypeSizeInBits(Ty);
- unsigned Align = I.getAlignment() ?
- I.getAlignment() : DL.getABITypeAlignment(Ty);
+ Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
- return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+ return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
static void extractValues(IRBuilder<> &Builder,
SmallVectorImpl<Value *> &Values, Value *V) {
- VectorType *VT = dyn_cast<VectorType>(V->getType());
+ auto *VT = dyn_cast<FixedVectorType>(V->getType());
if (!VT) {
Values.push_back(V);
return;
@@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
- const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
- if (!CNum)
- return HasDenormals;
+// Find a select instruction, which may be hidden behind a cast. This is mostly
+// to deal with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+ Cast = nullptr;
+ if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+ return Sel;
- if (UnsafeDiv)
- return true;
+ if ((Cast = dyn_cast<CastInst>(V))) {
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+ return Sel;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ int SelOpNo = 0;
+
+ CastInst *CastOp;
+
+ // TODO: Should probably try to handle some cases with multiple
+ // users. Duplicating the select may be profitable for division.
+ SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
+ if (!Sel || !Sel->hasOneUse()) {
+ SelOpNo = 1;
+ Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
+ }
+
+ if (!Sel || !Sel->hasOneUse())
+ return false;
+
+ Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
+ Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
+ Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
+ if (!CBO || !CT || !CF)
+ return false;
+
+ if (CastOp) {
+ if (!CastOp->hasOneUse())
+ return false;
+ CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+ CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+ }
+
+ // TODO: Handle special 0/-1 cases DAG combine does, although we only really
+ // need to handle divisions here.
+ Constant *FoldedT = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedT))
+ return false;
+
+ Constant *FoldedF = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedF))
+ return false;
+
+ IRBuilder<> Builder(&BO);
+ Builder.SetCurrentDebugLocation(BO.getDebugLoc());
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
+ Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+ Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
+ FoldedT, FoldedF);
+ NewSelect->takeName(&BO);
+ BO.replaceAllUsesWith(NewSelect);
+ BO.eraseFromParent();
+ if (CastOp)
+ CastOp->eraseFromParent();
+ Sel->eraseFromParent();
+ return true;
+}
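
As a minimal illustration of what this fold buys (not part of the patch; the
names are made up for the sketch), folding the constant operand into both arms
of a one-use select removes the division entirely:

    // Before the fold: the udiv would still need to be expanded.
    unsigned beforeFold(bool Cond) { return (Cond ? 16u : 4u) / 4u; }

    // After the fold: both arms constant-folded, only a select remains.
    unsigned afterFold(bool Cond) { return Cond ? 4u : 1u; }
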
+
+// Optimize fdiv with rcp:
+//
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+ bool RcpIsAccurate, IRBuilder<> &Builder,
+ Module *Mod) {
+
+ if (!AllowInaccurateRcp && !RcpIsAccurate)
+ return nullptr;
+
+ Type *Ty = Den->getType();
+ if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+ if (AllowInaccurateRcp || RcpIsAccurate) {
+ if (CLHS->isExactlyValue(1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
+ // the CI documentation, have a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+ // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+ // insert rsq intrinsic here.
+
+ // 1.0 / x -> rcp(x)
+ return Builder.CreateCall(Decl, { Den });
+ }
+
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // -1.0 / x -> rcp (fneg x)
+ Value *FNeg = Builder.CreateFNeg(Den);
+ return Builder.CreateCall(Decl, { FNeg });
+ }
+ }
+ }
- bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+ if (AllowInaccurateRcp) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
- // Reciprocal f32 is handled separately without denormals.
- return HasDenormals ^ IsOne;
+ // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y)
+ Value *Recip = Builder.CreateCall(Decl, { Den });
+ return Builder.CreateFMul(Num, Recip);
+ }
+ return nullptr;
+}
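
A scalar sketch of the three rewrites above (illustrative only; it assumes the
clang builtin __builtin_amdgcn_rcpf as a stand-in for llvm.amdgcn.rcp.f32):

    // 1.0f / X -> rcp(X): fine whenever rcp is accurate enough or afn is set.
    float rcpOnly(float X) { return __builtin_amdgcn_rcpf(X); }

    // -1.0f / X -> rcp(-X): the sign is folded into the operand.
    float negRcp(float X) { return __builtin_amdgcn_rcpf(-X); }

    // A / B -> A * rcp(B): only when an inaccurate rcp is allowed (unsafe/afn).
    float mulByRcp(float A, float B) { return A * __builtin_amdgcn_rcpf(B); }
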
+
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+ bool HasDenormals, IRBuilder<> &Builder,
+ Module *Mod) {
+ // fdiv.fast can achieve 2.5 ULP accuracy.
+ if (ReqdAccuracy < 2.5f)
+ return nullptr;
+
+ // Only have fdiv.fast for f32.
+ Type *Ty = Den->getType();
+ if (!Ty->isFloatTy())
+ return nullptr;
+
+ bool NumIsOne = false;
+ if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+ if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+ NumIsOne = true;
+ }
+
+ // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
+ if (HasDenormals && !NumIsOne)
+ return nullptr;
+
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+ return Builder.CreateCall(Decl, { Num, Den });
}
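
The guards above boil down to one predicate; a hypothetical standalone helper
(not in the patch) that mirrors the early-outs:

    // fdiv.fast is only emitted for f32, when 2.5 ulp of error is acceptable,
    // and when denormals are flushed (unless the numerator is exactly +/-1.0).
    bool canUseFDivFast(bool IsF32, float ReqdAccuracy, bool HasDenormals,
                        bool NumIsOne) {
      return IsF32 && ReqdAccuracy >= 2.5f && (!HasDenormals || NumIsOne);
    }
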
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+// The optimization is performed based on fpmath, fast math flags, and the
+// denormal mode, to rewrite fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is preferred when both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
- Type *Ty = FDiv.getType();
- if (!Ty->getScalarType()->isFloatTy())
- return false;
+ Type *Ty = FDiv.getType()->getScalarType();
- MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
- if (!FPMath)
+ // No intrinsic for fdiv16 if the target does not support f16.
+ if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
- float ULP = FPOp->getFPAccuracy();
- if (ULP < 2.5f)
- return false;
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+ // Inaccurate rcp is allowed with unsafe-fp-math or afn.
FastMathFlags FMF = FPOp->getFastMathFlags();
- bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
- FMF.allowReciprocal();
+ const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
- // With UnsafeDiv node will be optimized to just rcp and mul.
- if (UnsafeDiv)
- return false;
+ // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+ // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rcp_f64 is never accurate.
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+ (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
- IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(FMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
Value *Num = FDiv.getOperand(0);
Value *Den = FDiv.getOperand(1);
Value *NewFDiv = nullptr;
-
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
NewFDiv = UndefValue::get(VT);
// FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
Value *NumEltI = Builder.CreateExtractElement(Num, I);
Value *DenEltI = Builder.CreateExtractElement(Den, I);
- Value *NewElt;
-
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+ // Try rcp first.
+ Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+ RcpIsAccurate, Builder, Mod);
+ if (!NewElt) // Try fdiv.fast.
+ NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+ HasFP32Denormals, Builder, Mod);
+ if (!NewElt) // Keep the original.
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
- } else {
- NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
- }
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
- } else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
- NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ } else { // Scalar FDiv.
+ // Try rcp first.
+ NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+ Builder, Mod);
+ if (!NewFDiv) { // Try fdiv.fast.
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+ Builder, Mod);
+ }
}
if (NewFDiv) {
@@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
return getMul64(Builder, LHS, RHS).second;
}
-// The fractional part of a float is enough to accurately represent up to
-// a 24-bit signed integer.
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den,
- bool IsDiv, bool IsSigned) const {
- assert(Num->getType()->isIntegerTy(32));
-
+/// Figure out how many bits are really needed for this division. \p AtLeast is
+/// an optimization hint to bypass the second ComputeNumSignBits call if the
+/// first one is insufficient. Returns -1 on failure.
+int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool IsSigned) const {
const DataLayout &DL = Mod->getDataLayout();
unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
- if (LHSSignBits < 9)
- return nullptr;
+ if (LHSSignBits < AtLeast)
+ return -1;
unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
- if (RHSSignBits < 9)
- return nullptr;
-
+ if (RHSSignBits < AtLeast)
+ return -1;
unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
- unsigned DivBits = 32 - SignBits;
+ unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
if (IsSigned)
++DivBits;
+ return DivBits;
+}
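
A worked example of the narrowing (numbers assumed for illustration): if both
i32 operands are sign extensions of i16 values, ComputeNumSignBits reports at
least 17 sign bits on each side, so a signed divide needs only 16 bits and can
take the 24-bit path below.

    // DivBits = ScalarSize - min(LHSSignBits, RHSSignBits) + IsSigned
    static_assert(32 - 17 + 1 == 16, "fits the 24-bit float-based expansion");
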
- Type *Ty = Num->getType();
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
+ if (DivBits == -1)
+ return nullptr;
+ return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned DivBits,
+ bool IsDiv, bool IsSigned) const {
Type *I32Ty = Builder.getInt32Ty();
+ Num = Builder.CreateTrunc(Num, I32Ty);
+ Den = Builder.CreateTrunc(Den, I32Ty);
+
Type *F32Ty = Builder.getFloatTy();
ConstantInt *One = Builder.getInt32(1);
Value *JQ = One;
@@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
: Builder.CreateUIToFP(IB,F32Ty);
- Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
+ Builder.getFloatTy());
+ Value *RCP = Builder.CreateCall(RcpDecl, { FB });
Value *FQM = Builder.CreateFMul(FA, RCP);
// fq = trunc(fqm);
@@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FQNeg = Builder.CreateFNeg(FQ);
// float fr = mad(fqneg, fb, fa);
- Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ auto FMAD = !ST->hasMadMacF32Insts()
+ ? Intrinsic::fma
+ : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
+ Value *FR = Builder.CreateIntrinsic(FMAD,
{FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
// int iq = (int)fq;
@@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Res = Builder.CreateSub(Num, Rem);
}
- // Truncate to number of bits this divide really is.
- if (IsSigned) {
- Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
- Res = Builder.CreateSExt(Res, Ty);
- } else {
- ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
- Res = Builder.CreateAnd(Res, TruncMask);
+ if (DivBits != 0 && DivBits < 32) {
+ // Extend in register from the number of bits this divide really is.
+ if (IsSigned) {
+ int InRegBits = 32 - DivBits;
+
+ Res = Builder.CreateShl(Res, InRegBits);
+ Res = Builder.CreateAShr(Res, InRegBits);
+ } else {
+ ConstantInt *TruncMask
+ = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
}
return Res;
}
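
The in-register narrowing at the end can be read as the following host-side
sketch (assumes a 32-bit int and an arithmetic right shift; not part of the
patch):

    // Sign-extend the low DivBits bits by shifting up and arithmetically back.
    int signExtendInReg(int Res, unsigned DivBits) {
      unsigned InRegBits = 32 - DivBits;
      return (int)((unsigned)Res << InRegBits) >> InRegBits;
    }
    // The unsigned case only needs to mask the low DivBits bits.
    unsigned truncToDivBits(unsigned Res, unsigned DivBits) {
      return Res & (unsigned)((1ull << DivBits) - 1);
    }
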
-Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den) const {
+// Try to recognize special cases for which the DAG will emit special, better
+// expansions than the general expansion we do here.
+
+// TODO: It would be better to just directly handle those optimizations here.
+bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
+ BinaryOperator &I, Value *Num, Value *Den) const {
+ if (Constant *C = dyn_cast<Constant>(Den)) {
+ // Arbitrary constants get a better expansion as long as a wider mulhi is
+ // legal.
+ if (C->getType()->getScalarSizeInBits() <= 32)
+ return true;
+
+ // TODO: Sdiv check for not exact for some reason.
+
+ // If there's no wider mulhi, there's only a better expansion for powers of
+ // two.
+ // TODO: Should really know for each vector element.
+ if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
+ return true;
+
+ return false;
+ }
+
+ if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (BinOpDen->getOpcode() == Instruction::Shl &&
+ isa<Constant>(BinOpDen->getOperand(0)) &&
+ isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
+ 0, AC, &I, DT)) {
+ return true;
+ }
+ }
+
+ return false;
+}
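
The shl pattern above corresponds to the following scalar equivalence (a
hypothetical example; the actual fold is left for the DAG to perform):

    // fold (udiv x, (shl c, y)) -> x >>u (log2(c) + y)   iff c is a power of 2
    unsigned divByShiftedPow2(unsigned X, unsigned Y) {
      return X / (8u << Y);    // shape the IR has (assuming 8u << Y fits in 32 bits)
    }
    unsigned divByShiftedPow2Folded(unsigned X, unsigned Y) {
      return X >> (3u + Y);    // shape the DAG is expected to produce
    }
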
+
+static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
+ // Check whether the sign can be determined statically.
+ KnownBits Known = computeKnownBits(V, *DL);
+ if (Known.isNegative())
+ return Constant::getAllOnesValue(V->getType());
+ if (Known.isNonNegative())
+ return Constant::getNullValue(V->getType());
+ return Builder.CreateAShr(V, Builder.getInt32(31));
+}
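
The value returned here (0 or -1) feeds the branch-free absolute-value and
sign-restore sequence used in the signed path below; as a scalar sketch
(assumption: two's complement, arithmetic shift):

    // Sign = X >> 31 is 0 for non-negative X and -1 otherwise, so
    //   (X + Sign) ^ Sign   takes the absolute value, and
    //   (R ^ Sign) - Sign   re-applies the sign to the result afterwards.
    unsigned absViaSign(int X) {
      unsigned Sign = (unsigned)(X >> 31);   // 0 or 0xFFFFFFFF
      return ((unsigned)X + Sign) ^ Sign;
    }
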
+
+Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *X,
+ Value *Y) const {
Instruction::BinaryOps Opc = I.getOpcode();
assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv);
@@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
FMF.setFast();
Builder.setFastMathFlags(FMF);
- if (isa<Constant>(Den))
- return nullptr; // Keep it for optimization
+ if (divHasSpecialOptimization(I, X, Y))
+ return nullptr; // Keep it for later optimization.
bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
- Type *Ty = Num->getType();
+ Type *Ty = X->getType();
Type *I32Ty = Builder.getInt32Ty();
Type *F32Ty = Builder.getFloatTy();
if (Ty->getScalarSizeInBits() < 32) {
if (IsSigned) {
- Num = Builder.CreateSExt(Num, I32Ty);
- Den = Builder.CreateSExt(Den, I32Ty);
+ X = Builder.CreateSExt(X, I32Ty);
+ Y = Builder.CreateSExt(Y, I32Ty);
} else {
- Num = Builder.CreateZExt(Num, I32Ty);
- Den = Builder.CreateZExt(Den, I32Ty);
+ X = Builder.CreateZExt(X, I32Ty);
+ Y = Builder.CreateZExt(Y, I32Ty);
}
}
- if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
- Res = Builder.CreateTrunc(Res, Ty);
- return Res;
+ if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
+ return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
+ Builder.CreateZExtOrTrunc(Res, Ty);
}
ConstantInt *Zero = Builder.getInt32(0);
ConstantInt *One = Builder.getInt32(1);
- ConstantInt *MinusOne = Builder.getInt32(~0);
Value *Sign = nullptr;
if (IsSigned) {
- ConstantInt *K31 = Builder.getInt32(31);
- Value *LHSign = Builder.CreateAShr(Num, K31);
- Value *RHSign = Builder.CreateAShr(Den, K31);
+ Value *SignX = getSign32(X, Builder, DL);
+ Value *SignY = getSign32(Y, Builder, DL);
// Remainder sign is the same as LHS
- Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+ Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
- Num = Builder.CreateAdd(Num, LHSign);
- Den = Builder.CreateAdd(Den, RHSign);
+ X = Builder.CreateAdd(X, SignX);
+ Y = Builder.CreateAdd(Y, SignY);
- Num = Builder.CreateXor(Num, LHSign);
- Den = Builder.CreateXor(Den, RHSign);
+ X = Builder.CreateXor(X, SignX);
+ Y = Builder.CreateXor(Y, SignY);
}
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
- Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
- Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
- Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
- Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
-
- // RCP_LO, RCP_HI = mul(RCP, Den) */
- Value *RCP_LO, *RCP_HI;
- std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
- Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
-
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- Value *RCP_A_E = Builder.CreateAdd(RCP, E);
-
- // RCP_S_E = RCP - E
- Value *RCP_S_E = Builder.CreateSub(RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
-
- // Quotient = mulhu(Tmp0, Num)
- Value *Quotient = getMulHu(Builder, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+ // The algorithm here is based on ideas from "Software Integer Division", Tom
+ // Rodeheffer, August 2008.
+ //
+ // unsigned udiv(unsigned x, unsigned y) {
+ // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
+ // // that this is a lower bound on inv(y), even if some of the calculations
+ // // round up.
+ // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
+ //
+ // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
+ // // Empirically this is guaranteed to give a "two-y" lower bound on
+ // // inv(y).
+ // z += umulh(z, -y * z);
+ //
+ // // Quotient/remainder estimate.
+ // unsigned q = umulh(x, z);
+ // unsigned r = x - q * y;
+ //
+ // // Two rounds of quotient/remainder refinement.
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ //
+ // return q;
+ // }
+
+ // Initial estimate of inv(y).
+ Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
+ Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
+ Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+ Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+ Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
+ Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
+
+ // One round of UNR.
+ Value *NegY = Builder.CreateSub(Zero, Y);
+ Value *NegYZ = Builder.CreateMul(NegY, Z);
+ Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ Value *Q = getMulHu(Builder, X, Z);
+ Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
+
+ // First quotient/remainder refinement.
+ Value *Cond = Builder.CreateICmpUGE(R, Y);
+ if (IsDiv)
+ Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = Builder.CreateICmpUGE(R, Y);
+ Value *Res;
+ if (IsDiv)
+ Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ else
+ Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
- // Remainder = Num - Num_S_Remainder
- Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
- Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+ Res = Builder.CreateTrunc(Res, Ty);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
- Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
- MinusOne, Zero);
+ return Res;
+}
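
For reference, the pseudocode in the comment above compiles almost verbatim on
the host if the correctly rounded 1.0f / y stands in for v_rcp_f32 (a sketch
under that assumption only; the scale constant and the refinement steps are
taken straight from the comment):

    static unsigned mulhu(unsigned A, unsigned B) {
      return (unsigned)(((unsigned long long)A * B) >> 32);
    }

    unsigned udiv32(unsigned X, unsigned Y) {
      // Initial estimate of inv(Y); the scale keeps it a lower bound.
      unsigned Z = (unsigned)((4294967296.0 - 512.0) * (1.0f / (float)Y));
      // One round of unsigned Newton-Raphson refinement: z += umulh(z, -y * z).
      Z += mulhu(Z, (0u - Y) * Z);
      // Quotient/remainder estimate followed by two refinement steps.
      unsigned Q = mulhu(X, Z);
      unsigned R = X - Q * Y;
      if (R >= Y) { ++Q; R -= Y; }
      if (R >= Y) { ++Q; R -= Y; }
      return Q;
    }
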
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
- Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
+ return nullptr; // Keep it for later optimization.
- Value *Res;
- if (IsDiv) {
- // Quotient_A_One = Quotient + 1
- Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+ Instruction::BinaryOps Opc = I.getOpcode();
- // Quotient_S_One = Quotient - 1
- Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+ bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
+ bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+ int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+ if (NumDivBits == -1)
+ return nullptr;
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
- } else {
- // Remainder_S_Den = Remainder - Den
- Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+ Value *Narrowed = nullptr;
+ if (NumDivBits <= 24) {
+ Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
+ IsDiv, IsSigned);
+ } else if (NumDivBits <= 32) {
+ Narrowed = expandDivRem32(Builder, I, Num, Den);
+ }
- // Remainder_A_Den = Remainder + Den
- Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+ if (Narrowed) {
+ return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
+ Builder.CreateZExt(Narrowed, Num->getType());
+ }
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+ return nullptr;
+}
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ // Do the general expansion.
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
+ expandDivisionUpTo64Bits(&I);
+ return;
}
- if (IsSigned) {
- Res = Builder.CreateXor(Res, Sign);
- Res = Builder.CreateSub(Res, Sign);
+ if (Opc == Instruction::URem || Opc == Instruction::SRem) {
+ expandRemainderUpTo64Bits(&I);
+ return;
}
- Res = Builder.CreateTrunc(Res, Ty);
-
- return Res;
+ llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (foldBinOpIntoSelect(I))
+ return true;
+
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
@@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
Value *NewDiv = nullptr;
+ unsigned ScalarSize = Ty->getScalarSizeInBits();
+
+ SmallVector<BinaryOperator *, 8> Div64ToExpand;
+
if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
- Ty->getScalarSizeInBits() <= 32) {
+ ScalarSize <= 64 &&
+ !DisableIDivExpand) {
Value *Num = I.getOperand(0);
Value *Den = I.getOperand(1);
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
NewDiv = UndefValue::get(VT);
for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
Value *NumEltN = Builder.CreateExtractElement(Num, N);
Value *DenEltN = Builder.CreateExtractElement(Den, N);
- Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
- if (!NewElt)
- NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+
+ Value *NewElt;
+ if (ScalarSize <= 32) {
+ NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ } else {
+ // See if this 64-bit division can be shrunk to 32/24-bits before
+ // producing the general expansion.
+ NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
+ if (!NewElt) {
+ // The general 64-bit expansion introduces control flow and doesn't
+ // return the new value. Just insert a scalar copy and defer
+ // expanding it.
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
+ }
+ }
+
NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
}
} else {
- NewDiv = expandDivRem32(Builder, I, Num, Den);
+ if (ScalarSize <= 32)
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ else {
+ NewDiv = shrinkDivRem64(Builder, I, Num, Den);
+ if (!NewDiv)
+ Div64ToExpand.push_back(&I);
+ }
}
if (NewDiv) {
@@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
}
}
+ if (ExpandDiv64InIR) {
+ // TODO: We get much worse code in specially handled constant cases.
+ for (BinaryOperator *Div : Div64ToExpand) {
+ expandDivRem64(*Div);
+ Changed = true;
+ }
+ }
+
return Changed;
}
@@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
ST = &TM.getSubtarget<GCNSubtarget>(F);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
HasUnsafeFPMath = hasUnsafeFPMath(F);
- HasFP32Denormals = ST->hasFP32Denormals(F);
+
+ AMDGPU::SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.allFP32Denormals();
bool MadeChange = false;
- for (BasicBlock &BB : F) {
+ Function::iterator NextBB;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+ BasicBlock *BB = &*FI;
+ NextBB = std::next(FI);
+
BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
Next = std::next(I);
+
MadeChange |= visit(*I);
+
+ if (Next != E) { // Control flow changed
+ BasicBlock *NextInstBB = Next->getParent();
+ if (NextInstBB != BB) {
+ BB = NextInstBB;
+ E = BB->end();
+ FE = F.end();
+ }
+ }
}
}