Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 714
1 file changed, 539 insertions, 175 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index cf908766caa0d..a795493017402 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -26,6 +28,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
@@ -41,6 +44,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
 #include <cassert>
 #include <iterator>

@@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads(
   "amdgpu-codegenprepare-widen-constant-loads",
   cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
   cl::ReallyHidden,
-  cl::init(true));
+  cl::init(false));

 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
@@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin(
   cl::ReallyHidden,
   cl::init(true));

+// Legalize 64-bit division by using the generic IR expansion.
+static cl::opt<bool> ExpandDiv64InIR(
+  "amdgpu-codegenprepare-expand-div64",
+  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
+// Leave all division operations as they are. This supersedes ExpandDiv64InIR
+// and is used for testing the legalizer.
+static cl::opt<bool> DisableIDivExpand(
+  "amdgpu-codegenprepare-disable-idiv-expansion",
+  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
 class AMDGPUCodeGenPrepare : public FunctionPass,
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const GCNSubtarget *ST = nullptr;
   AssumptionCache *AC = nullptr;
+  DominatorTree *DT = nullptr;
   LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
@@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// SelectionDAG has an issue where an and asserting the bits are known
   bool replaceMulWithMul24(BinaryOperator &I) const;

+  /// Perform the same function as the equivalently named function in
+  /// DAGCombiner. Since we expand some divisions here, we need to perform this
+  /// before obscuring.
+  bool foldBinOpIntoSelect(BinaryOperator &I) const;
+
+  bool divHasSpecialOptimization(BinaryOperator &I,
+                                 Value *Num, Value *Den) const;
+  int getDivNumBits(BinaryOperator &I,
+                    Value *Num, Value *Den,
+                    unsigned AtLeast, bool Signed) const;
+
   /// Expands 24 bit div or rem.
   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den,
                         bool IsDiv, bool IsSigned) const;

+  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
+                            Value *Num, Value *Den, unsigned NumBits,
+                            bool IsDiv, bool IsSigned) const;
+
   /// Expands 32 bit div or rem.
   Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den) const;
+  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
+                        Value *Num, Value *Den) const;
+  void expandDivRem64(BinaryOperator &I) const;
+
   /// Widen a scalar load.
   ///
   /// \details \p Widen scalar load for uniform, small type loads from constant
@@ -195,7 +233,10 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<LegacyDivergenceAnalysis>();
-    AU.setPreservesAll();
+
+    // FIXME: Division expansion needs to preserve the dominator tree.
+    if (!ExpandDiv64InIR)
+      AU.setPreservesAll();
   }
 };

@@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
   if (T->isIntegerTy())
     return B.getInt32Ty();
-  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
 }

 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
@@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
   Type *Ty = I.getType();
   const DataLayout &DL = Mod->getDataLayout();
   int TySize = DL.getTypeSizeInBits(Ty);
-  unsigned Align = I.getAlignment() ?
-    I.getAlignment() : DL.getABITypeAlignment(Ty);
+  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

-  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
 }

 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
 static void extractValues(IRBuilder<> &Builder,
                           SmallVectorImpl<Value *> &Values, Value *V) {
-  VectorType *VT = dyn_cast<VectorType>(V->getType());
+  auto *VT = dyn_cast<FixedVectorType>(V->getType());
   if (!VT) {
     Values.push_back(V);
     return;
   }
@@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   return true;
 }

-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
-  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
-  if (!CNum)
-    return HasDenormals;
+// Find a select instruction, which may have been cast. This is mostly to deal
+// with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+  Cast = nullptr;
+  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+    return Sel;

-  if (UnsafeDiv)
-    return true;
+  if ((Cast = dyn_cast<CastInst>(V))) {
+    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+      return Sel;
+  }
+
+  return nullptr;
+}
+
+bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+  // Don't do this unless the old select is going away. We want to eliminate the
+  // binary operator, not replace a binop with a select.
+  int SelOpNo = 0;
+
+  CastInst *CastOp;
+
+  // TODO: Should probably try to handle some cases with multiple
+  // users. Duplicating the select may be profitable for division.
+  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
+  if (!Sel || !Sel->hasOneUse()) {
+    SelOpNo = 1;
+    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
+  }
+
+  if (!Sel || !Sel->hasOneUse())
+    return false;
+
+  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
+  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
+  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
+  if (!CBO || !CT || !CF)
+    return false;
+
+  if (CastOp) {
+    if (!CastOp->hasOneUse())
+      return false;
+    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+  }
+
+  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
+  // need to handle divisions here.
+  Constant *FoldedT = SelOpNo ?
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
+  if (isa<ConstantExpr>(FoldedT))
+    return false;
+
+  Constant *FoldedF = SelOpNo ?
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
+  if (isa<ConstantExpr>(FoldedF))
+    return false;
+
+  IRBuilder<> Builder(&BO);
+  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
+  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
+    Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
+                                          FoldedT, FoldedF);
+  NewSelect->takeName(&BO);
+  BO.replaceAllUsesWith(NewSelect);
+  BO.eraseFromParent();
+  if (CastOp)
+    CastOp->eraseFromParent();
+  Sel->eraseFromParent();
+  return true;
+}
+
+// Optimize fdiv with rcp:
+//
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+                              bool RcpIsAccurate, IRBuilder<> &Builder,
+                              Module *Mod) {
+
+  if (!AllowInaccurateRcp && !RcpIsAccurate)
+    return nullptr;
+
+  Type *Ty = Den->getType();
+  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+    if (AllowInaccurateRcp || RcpIsAccurate) {
+      if (CLHS->isExactlyValue(1.0)) {
+        Function *Decl = Intrinsic::getDeclaration(
+          Mod, Intrinsic::amdgcn_rcp, Ty);
+
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation have a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.

+        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+        // insert rsq intrinsic here.

+        // 1.0 / x -> rcp(x)
+        return Builder.CreateCall(Decl, { Den });
+      }

+      // Same as for 1.0, but expand the sign out of the constant.
+      if (CLHS->isExactlyValue(-1.0)) {
+        Function *Decl = Intrinsic::getDeclaration(
+          Mod, Intrinsic::amdgcn_rcp, Ty);

+        // -1.0 / x -> rcp (fneg x)
+        Value *FNeg = Builder.CreateFNeg(Den);
+        return Builder.CreateCall(Decl, { FNeg });
+      }
+    }
+  }

-  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+  if (AllowInaccurateRcp) {
+    Function *Decl = Intrinsic::getDeclaration(
+      Mod, Intrinsic::amdgcn_rcp, Ty);

-  // Reciprocal f32 is handled separately without denormals.
-  return HasDenormals ^ IsOne;
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    Value *Recip = Builder.CreateCall(Decl, { Den });
+    return Builder.CreateFMul(Num, Recip);
+  }
+  return nullptr;
+}
+
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+                                   bool HasDenormals, IRBuilder<> &Builder,
+                                   Module *Mod) {
+  // fdiv.fast can achieve 2.5 ULP accuracy.
+  if (ReqdAccuracy < 2.5f)
+    return nullptr;
+
+  // Only have fdiv.fast for f32.
+  Type *Ty = Den->getType();
+  if (!Ty->isFloatTy())
+    return nullptr;
+
+  bool NumIsOne = false;
+  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+      NumIsOne = true;
+  }
+
+  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
+  if (HasDenormals && !NumIsOne)
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  return Builder.CreateCall(Decl, { Num, Den });
 }

-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+// Optimization is performed based on fpmath, fast math flags, and denormals to
+// optimize fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases that both are legal.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
-  Type *Ty = FDiv.getType();
-  if (!Ty->getScalarType()->isFloatTy())
-    return false;
+  Type *Ty = FDiv.getType()->getScalarType();

-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
+  // No intrinsic for fdiv16 if the target does not support f16.
+  if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;

   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
+  const float ReqdAccuracy = FPOp->getFPAccuracy();

+  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
-                                      FMF.allowReciprocal();
+  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

-  // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (UnsafeDiv)
-    return false;
+  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+  // rcp_f64 is never accurate.
+  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+                             (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

-  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);

   Value *NewFDiv = nullptr;
-
-  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
     NewFDiv = UndefValue::get(VT);

     // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt;
-
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+      // Try rcp first.
+      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+                                      RcpIsAccurate, Builder, Mod);
+      if (!NewElt) // Try fdiv.fast.
+        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+                                      HasFP32Denormals, Builder, Mod);
+      if (!NewElt) // Keep the original.
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
-      } else {
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
-      }

       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  } else { // Scalar FDiv.
+    // Try rcp first.
+    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+                              Builder, Mod);
+    if (!NewFDiv) { // Try fdiv.fast.
+      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+                                     Builder, Mod);
+    }
   }

   if (NewFDiv) {
@@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
   return getMul64(Builder, LHS, RHS).second;
 }

-// The fractional part of a float is enough to accurately represent up to
-// a 24-bit signed integer.
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
-                                            BinaryOperator &I,
-                                            Value *Num, Value *Den,
-                                            bool IsDiv, bool IsSigned) const {
-  assert(Num->getType()->isIntegerTy(32));
-
+/// Figure out how many bits are really needed for this division. \p AtLeast is
+/// an optimization hint to bypass the second ComputeNumSignBits call if the
+/// first one is insufficient. Returns -1 on failure.
+int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
+                                        Value *Num, Value *Den,
+                                        unsigned AtLeast, bool IsSigned) const {
   const DataLayout &DL = Mod->getDataLayout();
   unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
-  if (LHSSignBits < 9)
-    return nullptr;
+  if (LHSSignBits < AtLeast)
+    return -1;

   unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
-  if (RHSSignBits < 9)
-    return nullptr;
-
+  if (RHSSignBits < AtLeast)
+    return -1;

   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
-  unsigned DivBits = 32 - SignBits;
+  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
   if (IsSigned)
     ++DivBits;
+  return DivBits;
+}

-  Type *Ty = Num->getType();
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+                                            BinaryOperator &I,
+                                            Value *Num, Value *Den,
+                                            bool IsDiv, bool IsSigned) const {
+  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
+  if (DivBits == -1)
+    return nullptr;
+  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
+                                                BinaryOperator &I,
+                                                Value *Num, Value *Den,
+                                                unsigned DivBits,
+                                                bool IsDiv, bool IsSigned) const {
   Type *I32Ty = Builder.getInt32Ty();
+  Num = Builder.CreateTrunc(Num, I32Ty);
+  Den = Builder.CreateTrunc(Den, I32Ty);
+
   Type *F32Ty = Builder.getFloatTy();
   ConstantInt *One = Builder.getInt32(1);
   Value *JQ = One;

@@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
   Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                        : Builder.CreateUIToFP(IB,F32Ty);

-  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
+                                                Builder.getFloatTy());
+  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
   Value *FQM = Builder.CreateFMul(FA, RCP);

   // fq = trunc(fqm);
@@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
   Value *FQNeg = Builder.CreateFNeg(FQ);

   // float fr = mad(fqneg, fb, fa);
-  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+  auto FMAD = !ST->hasMadMacF32Insts()
+                  ? Intrinsic::fma
+                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
+  Value *FR = Builder.CreateIntrinsic(FMAD,
                                       {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

   // int iq = (int)fq;
@@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
     Res = Builder.CreateSub(Num, Rem);
   }

-  // Truncate to number of bits this divide really is.
-  if (IsSigned) {
-    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
-    Res = Builder.CreateSExt(Res, Ty);
-  } else {
-    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
-    Res = Builder.CreateAnd(Res, TruncMask);
+  if (DivBits != 0 && DivBits < 32) {
+    // Extend in register from the number of bits this divide really is.
+    if (IsSigned) {
+      int InRegBits = 32 - DivBits;
+
+      Res = Builder.CreateShl(Res, InRegBits);
+      Res = Builder.CreateAShr(Res, InRegBits);
+    } else {
+      ConstantInt *TruncMask
+        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+      Res = Builder.CreateAnd(Res, TruncMask);
+    }
   }

   return Res;
 }

-Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
-                                            BinaryOperator &I,
-                                            Value *Num, Value *Den) const {
+// Try to recognize special cases where the DAG will emit special, better
+// expansions than the general expansion we do here.
+
+// TODO: It would be better to just directly handle those optimizations here.
+bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
+  BinaryOperator &I, Value *Num, Value *Den) const {
+  if (Constant *C = dyn_cast<Constant>(Den)) {
+    // Arbitrary constants get a better expansion as long as a wider mulhi is
+    // legal.
+    if (C->getType()->getScalarSizeInBits() <= 32)
+      return true;
+
+    // TODO: SDiv check for not exact for some reason.
+
+    // If there's no wider mulhi, there's only a better expansion for powers of
+    // two.
+    // TODO: Should really know for each vector element.
+    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
+      return true;
+
+    return false;
+  }
+
+  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
+    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+    if (BinOpDen->getOpcode() == Instruction::Shl &&
+        isa<Constant>(BinOpDen->getOperand(0)) &&
+        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
+                               0, AC, &I, DT)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
+  // Check whether the sign can be determined statically.
+  KnownBits Known = computeKnownBits(V, *DL);
+  if (Known.isNegative())
+    return Constant::getAllOnesValue(V->getType());
+  if (Known.isNonNegative())
+    return Constant::getNullValue(V->getType());
+  return Builder.CreateAShr(V, Builder.getInt32(31));
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+                                            BinaryOperator &I, Value *X,
+                                            Value *Y) const {
   Instruction::BinaryOps Opc = I.getOpcode();
   assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
          Opc == Instruction::SRem || Opc == Instruction::SDiv);

@@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
   FMF.setFast();
   Builder.setFastMathFlags(FMF);

-  if (isa<Constant>(Den))
-    return nullptr; // Keep it for optimization
+  if (divHasSpecialOptimization(I, X, Y))
+    return nullptr; // Keep it for later optimization.

   bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
   bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

-  Type *Ty = Num->getType();
+  Type *Ty = X->getType();
   Type *I32Ty = Builder.getInt32Ty();
   Type *F32Ty = Builder.getFloatTy();

   if (Ty->getScalarSizeInBits() < 32) {
     if (IsSigned) {
-      Num = Builder.CreateSExt(Num, I32Ty);
-      Den = Builder.CreateSExt(Den, I32Ty);
+      X = Builder.CreateSExt(X, I32Ty);
+      Y = Builder.CreateSExt(Y, I32Ty);
     } else {
-      Num = Builder.CreateZExt(Num, I32Ty);
-      Den = Builder.CreateZExt(Den, I32Ty);
+      X = Builder.CreateZExt(X, I32Ty);
+      Y = Builder.CreateZExt(Y, I32Ty);
     }
   }

-  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
-    Res = Builder.CreateTrunc(Res, Ty);
-    return Res;
+  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
+    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
+                      Builder.CreateZExtOrTrunc(Res, Ty);
   }

   ConstantInt *Zero = Builder.getInt32(0);
   ConstantInt *One = Builder.getInt32(1);
-  ConstantInt *MinusOne = Builder.getInt32(~0);

   Value *Sign = nullptr;
   if (IsSigned) {
-    ConstantInt *K31 = Builder.getInt32(31);
-    Value *LHSign = Builder.CreateAShr(Num, K31);
-    Value *RHSign = Builder.CreateAShr(Den, K31);
+    Value *SignX = getSign32(X, Builder, DL);
+    Value *SignY = getSign32(Y, Builder, DL);
     // Remainder sign is the same as LHS
-    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

-    Num = Builder.CreateAdd(Num, LHSign);
-    Den = Builder.CreateAdd(Den, RHSign);
+    X = Builder.CreateAdd(X, SignX);
+    Y = Builder.CreateAdd(Y, SignY);

-    Num = Builder.CreateXor(Num, LHSign);
-    Den = Builder.CreateXor(Den, RHSign);
+    X = Builder.CreateXor(X, SignX);
+    Y = Builder.CreateXor(Y, SignY);
   }

-  // RCP = URECIP(Den) = 2^32 / Den + e
-  // e is rounding error.
-  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
-  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
-  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
-  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
-  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
-
-  // RCP_LO, RCP_HI = mul(RCP, Den) */
-  Value *RCP_LO, *RCP_HI;
-  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
-
-  // NEG_RCP_LO = -RCP_LO
-  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
-
-  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
-  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
-  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
-
-  // Calculate the rounding error from the URECIP instruction
-  // E = mulhu(ABS_RCP_LO, RCP)
-  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
-
-  // RCP_A_E = RCP + E
-  Value *RCP_A_E = Builder.CreateAdd(RCP, E);
-
-  // RCP_S_E = RCP - E
-  Value *RCP_S_E = Builder.CreateSub(RCP, E);
-
-  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
-  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
-
-  // Quotient = mulhu(Tmp0, Num)
-  Value *Quotient = getMulHu(Builder, Tmp0, Num);
-
-  // Num_S_Remainder = Quotient * Den
-  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+  // The algorithm here is based on ideas from "Software Integer Division", Tom
+  // Rodeheffer, August 2008.
+  //
+  // unsigned udiv(unsigned x, unsigned y) {
+  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
+  //   // that this is a lower bound on inv(y), even if some of the calculations
+  //   // round up.
+  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
+  //
+  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
+  //   // Empirically this is guaranteed to give a "two-y" lower bound on
+  //   // inv(y).
+  //   z += umulh(z, -y * z);
+  //
+  //   // Quotient/remainder estimate.
+  //   unsigned q = umulh(x, z);
+  //   unsigned r = x - q * y;
+  //
+  //   // Two rounds of quotient/remainder refinement.
+  //   if (r >= y) {
+  //     ++q;
+  //     r -= y;
+  //   }
+  //   if (r >= y) {
+  //     ++q;
+  //     r -= y;
+  //   }
+  //
+  //   return q;
+  // }
+
+  // Initial estimate of inv(y).
+  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
+  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
+  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
+  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
+
+  // One round of UNR.
+  Value *NegY = Builder.CreateSub(Zero, Y);
+  Value *NegYZ = Builder.CreateMul(NegY, Z);
+  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
+
+  // Quotient/remainder estimate.
+  Value *Q = getMulHu(Builder, X, Z);
+  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
+
+  // First quotient/remainder refinement.
+  Value *Cond = Builder.CreateICmpUGE(R, Y);
+  if (IsDiv)
+    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
+
+  // Second quotient/remainder refinement.
+  Cond = Builder.CreateICmpUGE(R, Y);
+  Value *Res;
+  if (IsDiv)
+    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+  else
+    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

-  // Remainder = Num - Num_S_Remainder
-  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+  if (IsSigned) {
+    Res = Builder.CreateXor(Res, Sign);
+    Res = Builder.CreateSub(Res, Sign);
+  }

-  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
-  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
-  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+  Res = Builder.CreateTrunc(Res, Ty);

-  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
-  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
-  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
-                                                  MinusOne, Zero);
+  return Res;
+}

-  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
-  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
-  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
+                                            BinaryOperator &I,
+                                            Value *Num, Value *Den) const {
+  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
+    return nullptr; // Keep it for later optimization.

-  Value *Res;
-  if (IsDiv) {
-    // Quotient_A_One = Quotient + 1
-    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+  Instruction::BinaryOps Opc = I.getOpcode();

-    // Quotient_S_One = Quotient - 1
-    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
+  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

-    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
-    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+  if (NumDivBits == -1)
+    return nullptr;

-    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
-    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
-  } else {
-    // Remainder_S_Den = Remainder - Den
-    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+  Value *Narrowed = nullptr;
+  if (NumDivBits <= 24) {
+    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
+                                  IsDiv, IsSigned);
+  } else if (NumDivBits <= 32) {
+    Narrowed = expandDivRem32(Builder, I, Num, Den);
+  }

-    // Remainder_A_Den = Remainder + Den
-    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+  if (Narrowed) {
+    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
+                      Builder.CreateZExt(Narrowed, Num->getType());
+  }

-    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
-    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+  return nullptr;
+}

-    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
-    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+  Instruction::BinaryOps Opc = I.getOpcode();
+  // Do the general expansion.
+  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
+    expandDivisionUpTo64Bits(&I);
+    return;
   }

-  if (IsSigned) {
-    Res = Builder.CreateXor(Res, Sign);
-    Res = Builder.CreateSub(Res, Sign);
+  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
+    expandRemainderUpTo64Bits(&I);
+    return;
   }

-  Res = Builder.CreateTrunc(Res, Ty);
-
-  return Res;
+  llvm_unreachable("not a division");
 }

 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  if (foldBinOpIntoSelect(I))
+    return true;
+
   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
       DA->isUniform(&I) && promoteUniformOpToI32(I))
     return true;

@@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
   Instruction::BinaryOps Opc = I.getOpcode();
   Type *Ty = I.getType();
   Value *NewDiv = nullptr;
+  unsigned ScalarSize = Ty->getScalarSizeInBits();
+
+  SmallVector<BinaryOperator *, 8> Div64ToExpand;
+
   if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
        Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
-      Ty->getScalarSizeInBits() <= 32) {
+      ScalarSize <= 64 &&
+      !DisableIDivExpand) {
     Value *Num = I.getOperand(0);
     Value *Den = I.getOperand(1);
     IRBuilder<> Builder(&I);
     Builder.SetCurrentDebugLocation(I.getDebugLoc());

-    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
       NewDiv = UndefValue::get(VT);

       for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
         Value *NumEltN = Builder.CreateExtractElement(Num, N);
         Value *DenEltN = Builder.CreateExtractElement(Den, N);
-        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
-        if (!NewElt)
-          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+
+        Value *NewElt;
+        if (ScalarSize <= 32) {
+          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+          if (!NewElt)
+            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+        } else {
+          // See if this 64-bit division can be shrunk to 32/24-bits before
+          // producing the general expansion.
+          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
+          if (!NewElt) {
+            // The general 64-bit expansion introduces control flow and doesn't
+            // return the new value. Just insert a scalar copy and defer
+            // expanding it.
+            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
+          }
+        }
+
         NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
       }
     } else {
-      NewDiv = expandDivRem32(Builder, I, Num, Den);
+      if (ScalarSize <= 32)
+        NewDiv = expandDivRem32(Builder, I, Num, Den);
+      else {
+        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
+        if (!NewDiv)
+          Div64ToExpand.push_back(&I);
+      }
     }

     if (NewDiv) {
@@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
     }
   }

+  if (ExpandDiv64InIR) {
+    // TODO: We get much worse code in specially handled constant cases.
+    for (BinaryOperator *Div : Div64ToExpand) {
+      expandDivRem64(*Div);
+      Changed = true;
+    }
+  }
+
   return Changed;
 }

@@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   HasUnsafeFPMath = hasUnsafeFPMath(F);
-  HasFP32Denormals = ST->hasFP32Denormals(F);
+
+  AMDGPU::SIModeRegisterDefaults Mode(F);
+  HasFP32Denormals = Mode.allFP32Denormals();

   bool MadeChange = false;

-  for (BasicBlock &BB : F) {
+  Function::iterator NextBB;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+    BasicBlock *BB = &*FI;
+    NextBB = std::next(FI);
+
     BasicBlock::iterator Next;
-    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
       Next = std::next(I);
+
       MadeChange |= visit(*I);
+
+      if (Next != E) { // Control flow changed
+        BasicBlock *NextInstBB = Next->getParent();
+        if (NextInstBB != BB) {
+          BB = NextInstBB;
+          E = BB->end();
+          FE = F.end();
+        }
+      }
     }
   }
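
Reviewer note on foldBinOpIntoSelect: the point of running it here is that a divide whose operands are a constant-armed select plus another constant can be folded away entirely before the expensive expansion runs. A minimal scalar sketch of the rewrite; the function name and constants are hypothetical, not taken from the patch or its tests:

    // Sketch only: what the IR rewrite does, expressed on plain integers.
    //   %r = udiv i32 (select i1 %c, i32 16, i32 4), 4
    // becomes
    //   %r = select i1 %c, i32 4, i32 1
    int foldedSelectExample(bool c) {
      // Before: a real division of a select between constants.
      //   return (c ? 16 : 4) / 4;
      // After: the binop is constant-folded into each arm.
      return c ? 4 : 1;
    }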
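Reviewer note on expandDivRem24Impl: the body between the shown hunks follows the classic float-reciprocal scheme (fq = trunc(fa * rcp(fb)), a fused fr = fa - fq * fb remainder, then a single +1 correction). A standalone C++ model of the unsigned case; udiv24 is a hypothetical helper, std::fma stands in for the fmad_ftz/fma intrinsic, and an exactly rounded 1.0f / fb is assumed to be at least as accurate as the hardware rcp the pass actually emits:

    #include <cmath>
    #include <cstdint>

    // Model of the <= 24-bit unsigned path of expandDivRem24Impl.
    uint32_t udiv24(uint32_t num, uint32_t den) {
      float fa = (float)num;                     // exact: num fits in 24 bits
      float fb = (float)den;
      float fq = std::trunc(fa * (1.0f / fb));   // quotient estimate
      float fr = std::fma(-fq, fb, fa);          // remainder in one rounding
      uint32_t iq = (uint32_t)fq;
      if (std::fabs(fr) >= std::fabs(fb))        // estimate was one too low
        ++iq;
      return iq;
    }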
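Reviewer note on the new expandDivRem32 body: the emitted IR is exactly the commented udiv() pseudocode in the patch, so it can be sanity-checked on the CPU. A runnable C++ transliteration; udiv32 and umulh are hypothetical helpers, umulh is modeled with a 64-bit multiply, and the amdgcn.rcp call is modeled by an ordinary float division (the real instruction is allowed up to 1 ulp of error, which the 2^32 - 512 scale factor is sized to absorb):

    #include <cstdint>

    static uint32_t umulh(uint32_t a, uint32_t b) {
      return (uint32_t)(((uint64_t)a * b) >> 32);  // what getMulHu computes
    }

    uint32_t udiv32(uint32_t x, uint32_t y) {
      // Initial lower-bound estimate of inv(y); 0x4F7FFFFE is 2^32 - 512.
      uint32_t z = (uint32_t)((1.0f / (float)y) * 4294966784.0f);
      // One round of unsigned Newton-Raphson.
      z += umulh(z, (0u - y) * z);
      // Quotient/remainder estimate plus two refinement rounds.
      uint32_t q = umulh(x, z);
      uint32_t r = x - q * y;
      if (r >= y) { ++q; r -= y; }
      if (r >= y) { ++q; r -= y; }
      return q;
    }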
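Reviewer note on the signed path: getSign32 plus the add/xor pairs implement the usual branchless absolute value, and the final xor/sub reapplies the sign, so SDiv/SRem reduce to the unsigned expansion. In scalar terms, with a hypothetical sdiv32_via_udiv helper, an ordinary / standing in for the unsigned expansion, and an arithmetic right shift assumed for signed >>:

    #include <cstdint>

    int32_t sdiv32_via_udiv(int32_t x, int32_t y) {
      uint32_t sx = (uint32_t)(x >> 31);       // 0 or ~0, like getSign32
      uint32_t sy = (uint32_t)(y >> 31);
      uint32_t sign = sx ^ sy;                 // quotient sign; rem uses sx

      uint32_t ux = ((uint32_t)x + sx) ^ sx;   // |x| without a branch
      uint32_t uy = ((uint32_t)y + sy) ^ sy;

      uint32_t uq = ux / uy;                   // the unsigned expansion

      return (int32_t)((uq ^ sign) - sign);    // reapply the sign
    }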