Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 714
1 file changed, 539 insertions, 175 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index cf908766caa0d..a795493017402 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -26,6 +28,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
@@ -41,6 +44,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
 #include <cassert>
 #include <iterator>

@@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads(
   "amdgpu-codegenprepare-widen-constant-loads",
   cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
   cl::ReallyHidden,
-  cl::init(true));
+  cl::init(false));

 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
@@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin(
   cl::ReallyHidden,
   cl::init(true));

+// Legalize 64-bit division by using the generic IR expansion.
+static cl::opt<bool> ExpandDiv64InIR(
+  "amdgpu-codegenprepare-expand-div64",
+  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
+// Leave all division operations as they are. This supersedes ExpandDiv64InIR
+// and is used for testing the legalizer.
+static cl::opt<bool> DisableIDivExpand(
+  "amdgpu-codegenprepare-disable-idiv-expansion",
+  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
 class AMDGPUCodeGenPrepare : public FunctionPass,
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const GCNSubtarget *ST = nullptr;
   AssumptionCache *AC = nullptr;
+  DominatorTree *DT = nullptr;
   LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
@@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// SelectionDAG has an issue where an and asserting the bits are known
   bool replaceMulWithMul24(BinaryOperator &I) const;

+  /// Perform the same function as the equivalently named function in
+  /// DAGCombiner. Since we expand some divisions here, we need to perform this
+  /// before obscuring.
+  bool foldBinOpIntoSelect(BinaryOperator &I) const;
+
+  bool divHasSpecialOptimization(BinaryOperator &I,
+                                 Value *Num, Value *Den) const;
+  int getDivNumBits(BinaryOperator &I,
+                    Value *Num, Value *Den,
+                    unsigned AtLeast, bool Signed) const;
+
   /// Expands 24 bit div or rem.
   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den,
                         bool IsDiv, bool IsSigned) const;

+  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
+                            Value *Num, Value *Den, unsigned NumBits,
+                            bool IsDiv, bool IsSigned) const;
+
   /// Expands 32 bit div or rem.
   Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den) const;
+  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
+                        Value *Num, Value *Den) const;
+  void expandDivRem64(BinaryOperator &I) const;
+
   /// Widen a scalar load.
   ///
   /// \details \p Widen scalar load for uniform, small type loads from constant
@@ -195,7 +233,10 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<LegacyDivergenceAnalysis>();
-    AU.setPreservesAll();
+
+    // FIXME: Division expansion needs to preserve the dominator tree.
+    if (!ExpandDiv64InIR)
+      AU.setPreservesAll();
   }
 };

@@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
   if (T->isIntegerTy())
     return B.getInt32Ty();
-  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
 }

 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
@@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
   Type *Ty = I.getType();
   const DataLayout &DL = Mod->getDataLayout();
   int TySize = DL.getTypeSizeInBits(Ty);
-  unsigned Align = I.getAlignment() ?
-    I.getAlignment() : DL.getABITypeAlignment(Ty);
+  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

-  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
 }

 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
 static void extractValues(IRBuilder<> &Builder,
                           SmallVectorImpl<Value *> &Values, Value *V) {
-  VectorType *VT = dyn_cast<VectorType>(V->getType());
+  auto *VT = dyn_cast<FixedVectorType>(V->getType());
   if (!VT) {
     Values.push_back(V);
     return;
   }
@@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   return true;
 }

-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
-  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
-  if (!CNum)
-    return HasDenormals;
+// Find a select instruction, which may have been cast. This is mostly to deal
+// with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+  Cast = nullptr;
+  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+    return Sel;

-  if (UnsafeDiv)
-    return true;
+  if ((Cast = dyn_cast<CastInst>(V))) {
+    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+      return Sel;
+  }
+
+  return nullptr;
+}
+
+bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+  // Don't do this unless the old select is going away. We want to eliminate the
+  // binary operator, not replace a binop with a select.
+  int SelOpNo = 0;
+
+  CastInst *CastOp;
+
+  // TODO: Should probably try to handle some cases with multiple
+  // users. Duplicating the select may be profitable for division.
+  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
+  if (!Sel || !Sel->hasOneUse()) {
+    SelOpNo = 1;
+    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
+  }
+
+  if (!Sel || !Sel->hasOneUse())
+    return false;
+
+  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
+  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
+  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
+  if (!CBO || !CT || !CF)
+    return false;
+
+  if (CastOp) {
+    if (!CastOp->hasOneUse())
+      return false;
+    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+  }
+
+  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
+  // need to handle divisions here.
+  Constant *FoldedT = SelOpNo ?
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
+  if (isa<ConstantExpr>(FoldedT))
+    return false;
+
+  Constant *FoldedF = SelOpNo ?
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
+    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
+  if (isa<ConstantExpr>(FoldedF))
+    return false;
+
+  IRBuilder<> Builder(&BO);
+  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
+  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
+    Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
+                                          FoldedT, FoldedF);
+  NewSelect->takeName(&BO);
+  BO.replaceAllUsesWith(NewSelect);
+  BO.eraseFromParent();
+  if (CastOp)
+    CastOp->eraseFromParent();
+  Sel->eraseFromParent();
+  return true;
+}
+
+// Optimize fdiv with rcp:
+//
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+                              bool RcpIsAccurate, IRBuilder<> &Builder,
+                              Module *Mod) {
+
+  if (!AllowInaccurateRcp && !RcpIsAccurate)
+    return nullptr;
+
+  Type *Ty = Den->getType();
+  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+    if (AllowInaccurateRcp || RcpIsAccurate) {
+      if (CLHS->isExactlyValue(1.0)) {
+        Function *Decl = Intrinsic::getDeclaration(
+          Mod, Intrinsic::amdgcn_rcp, Ty);
+
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation have a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.

+        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+        // insert rsq intrinsic here.

+        // 1.0 / x -> rcp(x)
+        return Builder.CreateCall(Decl, { Den });
+      }

+      // Same as for 1.0, but expand the sign out of the constant.
+      if (CLHS->isExactlyValue(-1.0)) {
+        Function *Decl = Intrinsic::getDeclaration(
+          Mod, Intrinsic::amdgcn_rcp, Ty);

+        // -1.0 / x -> rcp (fneg x)
+        Value *FNeg = Builder.CreateFNeg(Den);
+        return Builder.CreateCall(Decl, { FNeg });
+      }
+    }
+  }

-  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+  if (AllowInaccurateRcp) {
+    Function *Decl = Intrinsic::getDeclaration(
+      Mod, Intrinsic::amdgcn_rcp, Ty);

-  // Reciprocal f32 is handled separately without denormals.
-  return HasDenormals ^ IsOne;
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    Value *Recip = Builder.CreateCall(Decl, { Den });
+    return Builder.CreateFMul(Num, Recip);
+  }
+  return nullptr;
+}
+
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+                                   bool HasDenormals, IRBuilder<> &Builder,
+                                   Module *Mod) {
+  // fdiv.fast can achieve 2.5 ULP accuracy.
+  if (ReqdAccuracy < 2.5f)
+    return nullptr;
+
+  // Only have fdiv.fast for f32.
+  Type *Ty = Den->getType();
+  if (!Ty->isFloatTy())
+    return nullptr;
+
+  bool NumIsOne = false;
+  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+      NumIsOne = true;
+  }
+
+  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
+  if (HasDenormals && !NumIsOne)
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  return Builder.CreateCall(Decl, { Num, Den });
 }

-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+// Optimization is performed based on fpmath, fast math flags, and denormals to
+// optimize fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases that both are legal.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
-  Type *Ty = FDiv.getType();
-  if (!Ty->getScalarType()->isFloatTy())
-    return false;
+  Type *Ty = FDiv.getType()->getScalarType();

-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
+  // No intrinsic for fdiv16 if the target does not support f16.
+  if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;

   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
+  const float ReqdAccuracy = FPOp->getFPAccuracy();

+  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
-                                      FMF.allowReciprocal();
+  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

-  // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (UnsafeDiv)
-    return false;
+  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+  // rcp_f64 is never accurate.
+  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+                             (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

-  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);

   Value *NewFDiv = nullptr;
-
-  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
     NewFDiv = UndefValue::get(VT);

     // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt;
-
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+      // Try rcp first.
+      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+                                      RcpIsAccurate, Builder, Mod);
+      if (!NewElt) // Try fdiv.fast.
+        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+                                      HasFP32Denormals, Builder, Mod);
+      if (!NewElt) // Keep the original.
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
-      } else {
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
-      }

       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  } else { // Scalar FDiv.
+    // Try rcp first.
+    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+                              Builder, Mod);
+    if (!NewFDiv) { // Try fdiv.fast.
+      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+                                     Builder, Mod);
+    }
   }

   if (NewFDiv) {
@@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
   return getMul64(Builder, LHS, RHS).second;
 }

-// The fractional part of a float is enough to accurately represent up to
-// a 24-bit signed integer.
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
-                                            BinaryOperator &I,
-                                            Value *Num, Value *Den,
-                                            bool IsDiv, bool IsSigned) const {
-  assert(Num->getType()->isIntegerTy(32));
-
+/// Figure out how many bits are really needed for this division. \p AtLeast is
+/// an optimization hint to bypass the second ComputeNumSignBits call if the
+/// first one is insufficient. Returns -1 on failure.
+int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
+                                        Value *Num, Value *Den,
+                                        unsigned AtLeast, bool IsSigned) const {
   const DataLayout &DL = Mod->getDataLayout();
   unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
-  if (LHSSignBits < 9)
-    return nullptr;
+  if (LHSSignBits < AtLeast)
+    return -1;

   unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
-  if (RHSSignBits < 9)
-    return nullptr;
-
+  if (RHSSignBits < AtLeast)
+    return -1;

   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
-  unsigned DivBits = 32 - SignBits;
+  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
   if (IsSigned)
     ++DivBits;
+  return DivBits;
+}

-  Type *Ty = Num->getType();
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+                                            BinaryOperator &I,
+                                            Value *Num, Value *Den,
+                                            bool IsDiv, bool IsSigned) const {
+  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
+  if (DivBits == -1)
+    return nullptr;
+  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
+                                                BinaryOperator &I,
+                                                Value *Num, Value *Den,
+                                                unsigned DivBits,
+                                                bool IsDiv, bool IsSigned) const {
   Type *I32Ty = Builder.getInt32Ty();
+  Num = Builder.CreateTrunc(Num, I32Ty);
+  Den = Builder.CreateTrunc(Den, I32Ty);
+
   Type *F32Ty = Builder.getFloatTy();
   ConstantInt *One = Builder.getInt32(1);
   Value *JQ = One;

@@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
   Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                        : Builder.CreateUIToFP(IB,F32Ty);

-  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
+                                                Builder.getFloatTy());
+  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
   Value *FQM = Builder.CreateFMul(FA, RCP);

   // fq = trunc(fqm);
@@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
   Value *FQNeg = Builder.CreateFNeg(FQ);

   // float fr = mad(fqneg, fb, fa);
-  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+  auto FMAD = !ST->hasMadMacF32Insts()
+                  ? Intrinsic::fma
+                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
+  Value *FR = Builder.CreateIntrinsic(FMAD,
                                       {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

   // int iq = (int)fq;
@@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
     Res = Builder.CreateSub(Num, Rem);
   }

-  // Truncate to number of bits this divide really is.
-  if (IsSigned) {
-    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
-    Res = Builder.CreateSExt(Res, Ty);
-  } else {
-    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
-    Res = Builder.CreateAnd(Res, TruncMask);
+  if (DivBits != 0 && DivBits < 32) {
+    // Extend in register from the number of bits this divide really is.
+    if (IsSigned) {
+      int InRegBits = 32 - DivBits;
+
+      Res = Builder.CreateShl(Res, InRegBits);
+      Res = Builder.CreateAShr(Res, InRegBits);
+    } else {
+      ConstantInt *TruncMask
+        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+      Res = Builder.CreateAnd(Res, TruncMask);
+    }
   }

   return Res;
 }

-Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
-                                            BinaryOperator &I,
-                                            Value *Num, Value *Den) const {
+// Try to recognize special cases where the DAG will emit special, better
+// expansions than the general expansion we do here.
+
+// TODO: It would be better to just directly handle those optimizations here.
+bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
+  BinaryOperator &I, Value *Num, Value *Den) const {
+  if (Constant *C = dyn_cast<Constant>(Den)) {
+    // Arbitrary constants get a better expansion as long as a wider mulhi is
+    // legal.
+    if (C->getType()->getScalarSizeInBits() <= 32)
+      return true;
+
+    // TODO: SDiv check for not exact for some reason.
+
+    // If there's no wider mulhi, there's only a better expansion for powers of
+    // two.
+    // TODO: Should really know for each vector element.
+    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
+      return true;
+
+    return false;
+  }
+
+  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
+    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+    if (BinOpDen->getOpcode() == Instruction::Shl &&
+        isa<Constant>(BinOpDen->getOperand(0)) &&
+        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
+                               0, AC, &I, DT)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
+  // Check whether the sign can be determined statically.
+  KnownBits Known = computeKnownBits(V, *DL);
+  if (Known.isNegative())
+    return Constant::getAllOnesValue(V->getType());
+  if (Known.isNonNegative())
+    return Constant::getNullValue(V->getType());
+  return Builder.CreateAShr(V, Builder.getInt32(31));
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+                                            BinaryOperator &I, Value *X,
+                                            Value *Y) const {
   Instruction::BinaryOps Opc = I.getOpcode();
   assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
          Opc == Instruction::SRem || Opc == Instruction::SDiv);

@@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
   FMF.setFast();
   Builder.setFastMathFlags(FMF);

-  if (isa<Constant>(Den))
-    return nullptr; // Keep it for optimization
+  if (divHasSpecialOptimization(I, X, Y))
+    return nullptr; // Keep it for later optimization.

   bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
   bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

-  Type *Ty = Num->getType();
+  Type *Ty = X->getType();
   Type *I32Ty = Builder.getInt32Ty();
   Type *F32Ty = Builder.getFloatTy();

   if (Ty->getScalarSizeInBits() < 32) {
     if (IsSigned) {
-      Num = Builder.CreateSExt(Num, I32Ty);
-      Den = Builder.CreateSExt(Den, I32Ty);
+      X = Builder.CreateSExt(X, I32Ty);
+      Y = Builder.CreateSExt(Y, I32Ty);
     } else {
-      Num = Builder.CreateZExt(Num, I32Ty);
-      Den = Builder.CreateZExt(Den, I32Ty);
+      X = Builder.CreateZExt(X, I32Ty);
+      Y = Builder.CreateZExt(Y, I32Ty);
     }
   }

-  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
-    Res = Builder.CreateTrunc(Res, Ty);
-    return Res;
+  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
+    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
+                      Builder.CreateZExtOrTrunc(Res, Ty);
   }

   ConstantInt *Zero = Builder.getInt32(0);
   ConstantInt *One = Builder.getInt32(1);
-  ConstantInt *MinusOne = Builder.getInt32(~0);

   Value *Sign = nullptr;
   if (IsSigned) {
-    ConstantInt *K31 = Builder.getInt32(31);
-    Value *LHSign = Builder.CreateAShr(Num, K31);
-    Value *RHSign = Builder.CreateAShr(Den, K31);
+    Value *SignX = getSign32(X, Builder, DL);
+    Value *SignY = getSign32(Y, Builder, DL);
     // Remainder sign is the same as LHS
-    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

-    Num = Builder.CreateAdd(Num, LHSign);
-    Den = Builder.CreateAdd(Den, RHSign);
+    X = Builder.CreateAdd(X, SignX);
+    Y = Builder.CreateAdd(Y, SignY);

-    Num = Builder.CreateXor(Num, LHSign);
-    Den = Builder.CreateXor(Den, RHSign);
+    X = Builder.CreateXor(X, SignX);
+    Y = Builder.CreateXor(Y, SignY);
   }

-  // RCP = URECIP(Den) = 2^32 / Den + e
-  // e is rounding error.
-  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
-  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
-  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
-  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
-  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
-
-  // RCP_LO, RCP_HI = mul(RCP, Den) */
-  Value *RCP_LO, *RCP_HI;
-  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
-
-  // NEG_RCP_LO = -RCP_LO
-  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
-
-  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
-  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
-  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
-
-  // Calculate the rounding error from the URECIP instruction
-  // E = mulhu(ABS_RCP_LO, RCP)
-  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
-
-  // RCP_A_E = RCP + E
-  Value *RCP_A_E = Builder.CreateAdd(RCP, E);
-
-  // RCP_S_E = RCP - E
-  Value *RCP_S_E = Builder.CreateSub(RCP, E);
-
-  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
-  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
-
-  // Quotient = mulhu(Tmp0, Num)
-  Value *Quotient = getMulHu(Builder, Tmp0, Num);
-
-  // Num_S_Remainder = Quotient * Den
-  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+  // The algorithm here is based on ideas from "Software Integer Division", Tom
+  // Rodeheffer, August 2008.
+  //
+  // unsigned udiv(unsigned x, unsigned y) {
+  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
+  //   // that this is a lower bound on inv(y), even if some of the calculations
+  //   // round up.
+  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
+  //
+  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
+  //   // Empirically this is guaranteed to give a "two-y" lower bound on
+  //   // inv(y).
+  //   z += umulh(z, -y * z);
+  //
+  //   // Quotient/remainder estimate.
+  //   unsigned q = umulh(x, z);
+  //   unsigned r = x - q * y;
+  //
+  //   // Two rounds of quotient/remainder refinement.
+  //   if (r >= y) {
+  //     ++q;
+  //     r -= y;
+  //   }
+  //   if (r >= y) {
+  //     ++q;
+  //     r -= y;
+  //   }
+  //
+  //   return q;
+  // }
+
+  // Initial estimate of inv(y).
+  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
+  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
+  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
+  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
+
+  // One round of UNR.
+  Value *NegY = Builder.CreateSub(Zero, Y);
+  Value *NegYZ = Builder.CreateMul(NegY, Z);
+  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
+
+  // Quotient/remainder estimate.
+  Value *Q = getMulHu(Builder, X, Z);
+  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
+
+  // First quotient/remainder refinement.
+  Value *Cond = Builder.CreateICmpUGE(R, Y);
+  if (IsDiv)
+    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
+
+  // Second quotient/remainder refinement.
+  Cond = Builder.CreateICmpUGE(R, Y);
+  Value *Res;
+  if (IsDiv)
+    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+  else
+    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

-  // Remainder = Num - Num_S_Remainder
-  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+  if (IsSigned) {
+    Res = Builder.CreateXor(Res, Sign);
+    Res = Builder.CreateSub(Res, Sign);
+  }

-  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
-  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
-  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+  Res = Builder.CreateTrunc(Res, Ty);

-  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
-  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
-  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
-                                                  MinusOne, Zero);
+  return Res;
+}

-  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
-  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
-  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
+                                            BinaryOperator &I,
+                                            Value *Num, Value *Den) const {
+  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
+    return nullptr; // Keep it for later optimization.

-  Value *Res;
-  if (IsDiv) {
-    // Quotient_A_One = Quotient + 1
-    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+  Instruction::BinaryOps Opc = I.getOpcode();

-    // Quotient_S_One = Quotient - 1
-    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
+  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

-    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
-    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+  if (NumDivBits == -1)
+    return nullptr;

-    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
-    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
-  } else {
-    // Remainder_S_Den = Remainder - Den
-    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+  Value *Narrowed = nullptr;
+  if (NumDivBits <= 24) {
+    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
+                                  IsDiv, IsSigned);
+  } else if (NumDivBits <= 32) {
+    Narrowed = expandDivRem32(Builder, I, Num, Den);
+  }

-    // Remainder_A_Den = Remainder + Den
-    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+  if (Narrowed) {
+    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
+                      Builder.CreateZExt(Narrowed, Num->getType());
+  }

-    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
-    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+  return nullptr;
+}

-    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
-    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+  Instruction::BinaryOps Opc = I.getOpcode();
+  // Do the general expansion.
+  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
+    expandDivisionUpTo64Bits(&I);
+    return;
   }

-  if (IsSigned) {
-    Res = Builder.CreateXor(Res, Sign);
-    Res = Builder.CreateSub(Res, Sign);
+  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
+    expandRemainderUpTo64Bits(&I);
+    return;
   }

-  Res = Builder.CreateTrunc(Res, Ty);
-
-  return Res;
+  llvm_unreachable("not a division");
 }

 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  if (foldBinOpIntoSelect(I))
+    return true;
+
   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
       DA->isUniform(&I) && promoteUniformOpToI32(I))
     return true;

@@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
   Instruction::BinaryOps Opc = I.getOpcode();
   Type *Ty = I.getType();
   Value *NewDiv = nullptr;
+  unsigned ScalarSize = Ty->getScalarSizeInBits();
+
+  SmallVector<BinaryOperator *, 8> Div64ToExpand;
+
   if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
        Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
-      Ty->getScalarSizeInBits() <= 32) {
+      ScalarSize <= 64 &&
+      !DisableIDivExpand) {
     Value *Num = I.getOperand(0);
     Value *Den = I.getOperand(1);
     IRBuilder<> Builder(&I);
     Builder.SetCurrentDebugLocation(I.getDebugLoc());

-    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
       NewDiv = UndefValue::get(VT);

       for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
         Value *NumEltN = Builder.CreateExtractElement(Num, N);
         Value *DenEltN = Builder.CreateExtractElement(Den, N);
-        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
-        if (!NewElt)
-          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+
+        Value *NewElt;
+        if (ScalarSize <= 32) {
+          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+          if (!NewElt)
+            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+        } else {
+          // See if this 64-bit division can be shrunk to 32/24-bits before
+          // producing the general expansion.
+          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
+          if (!NewElt) {
+            // The general 64-bit expansion introduces control flow and doesn't
+            // return the new value. Just insert a scalar copy and defer
+            // expanding it.
+            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
+          }
+        }
+
         NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
       }
     } else {
-      NewDiv = expandDivRem32(Builder, I, Num, Den);
+      if (ScalarSize <= 32)
+        NewDiv = expandDivRem32(Builder, I, Num, Den);
+      else {
+        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
+        if (!NewDiv)
+          Div64ToExpand.push_back(&I);
+      }
     }

     if (NewDiv) {
@@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
     }
   }

+  if (ExpandDiv64InIR) {
+    // TODO: We get much worse code in specially handled constant cases.
+    for (BinaryOperator *Div : Div64ToExpand) {
+      expandDivRem64(*Div);
+      Changed = true;
+    }
+  }
+
   return Changed;
 }

@@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   HasUnsafeFPMath = hasUnsafeFPMath(F);
-  HasFP32Denormals = ST->hasFP32Denormals(F);
+
+  AMDGPU::SIModeRegisterDefaults Mode(F);
+  HasFP32Denormals = Mode.allFP32Denormals();

   bool MadeChange = false;

-  for (BasicBlock &BB : F) {
+  Function::iterator NextBB;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+    BasicBlock *BB = &*FI;
+    NextBB = std::next(FI);
+
     BasicBlock::iterator Next;
-    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
       Next = std::next(I);
+
       MadeChange |= visit(*I);
+
+      if (Next != E) { // Control flow changed
+        BasicBlock *NextInstBB = Next->getParent();
+        if (NextInstBB != BB) {
+          BB = NextInstBB;
+          E = BB->end();
+          FE = F.end();
+        }
+      }
     }
   }
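
Reviewer note on foldBinOpIntoSelect: the point of running it here is that a divide whose operands are a constant-armed select plus another constant can be folded away entirely before the expensive expansion runs. A minimal scalar sketch of the rewrite; the function name and constants are hypothetical, not taken from the patch or its tests:

    // Sketch only: what the IR rewrite does, expressed on plain integers.
    //   %r = udiv i32 (select i1 %c, i32 16, i32 4), 4
    // becomes
    //   %r = select i1 %c, i32 4, i32 1
    int foldedSelectExample(bool c) {
      // Before: a real division of a select between constants.
      //   return (c ? 16 : 4) / 4;
      // After: the binop is constant-folded into each arm.
      return c ? 4 : 1;
    }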
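Reviewer note on expandDivRem24Impl: the body between the shown hunks follows the classic float-reciprocal scheme (fq = trunc(fa * rcp(fb)), a fused fr = fa - fq * fb remainder, then a single +1 correction). A standalone C++ model of the unsigned case; udiv24 is a hypothetical helper, std::fma stands in for the fmad_ftz/fma intrinsic, and an exactly rounded 1.0f / fb is assumed to be at least as accurate as the hardware rcp the pass actually emits:

    #include <cmath>
    #include <cstdint>

    // Model of the <= 24-bit unsigned path of expandDivRem24Impl.
    uint32_t udiv24(uint32_t num, uint32_t den) {
      float fa = (float)num;                     // exact: num fits in 24 bits
      float fb = (float)den;
      float fq = std::trunc(fa * (1.0f / fb));   // quotient estimate
      float fr = std::fma(-fq, fb, fa);          // remainder in one rounding
      uint32_t iq = (uint32_t)fq;
      if (std::fabs(fr) >= std::fabs(fb))        // estimate was one too low
        ++iq;
      return iq;
    }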
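Reviewer note on the new expandDivRem32 body: the emitted IR is exactly the commented udiv() pseudocode in the patch, so it can be sanity-checked on the CPU. A runnable C++ transliteration; udiv32 and umulh are hypothetical helpers, umulh is modeled with a 64-bit multiply, and the amdgcn.rcp call is modeled by an ordinary float division (the real instruction is allowed up to 1 ulp of error, which the 2^32 - 512 scale factor is sized to absorb):

    #include <cstdint>

    static uint32_t umulh(uint32_t a, uint32_t b) {
      return (uint32_t)(((uint64_t)a * b) >> 32);  // what getMulHu computes
    }

    uint32_t udiv32(uint32_t x, uint32_t y) {
      // Initial lower-bound estimate of inv(y); 0x4F7FFFFE is 2^32 - 512.
      uint32_t z = (uint32_t)((1.0f / (float)y) * 4294966784.0f);
      // One round of unsigned Newton-Raphson.
      z += umulh(z, (0u - y) * z);
      // Quotient/remainder estimate plus two refinement rounds.
      uint32_t q = umulh(x, z);
      uint32_t r = x - q * y;
      if (r >= y) { ++q; r -= y; }
      if (r >= y) { ++q; r -= y; }
      return q;
    }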
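Reviewer note on the signed path: getSign32 plus the add/xor pairs implement the usual branchless absolute value, and the final xor/sub reapplies the sign, so SDiv/SRem reduce to the unsigned expansion. In scalar terms, with a hypothetical sdiv32_via_udiv helper, an ordinary / standing in for the unsigned expansion, and an arithmetic right shift assumed for signed >>:

    #include <cstdint>

    int32_t sdiv32_via_udiv(int32_t x, int32_t y) {
      uint32_t sx = (uint32_t)(x >> 31);       // 0 or ~0, like getSign32
      uint32_t sy = (uint32_t)(y >> 31);
      uint32_t sign = sx ^ sy;                 // quotient sign; rem uses sx

      uint32_t ux = ((uint32_t)x + sx) ^ sx;   // |x| without a branch
      uint32_t uy = ((uint32_t)y + sy) ^ sy;

      uint32_t uq = ux / uy;                   // the unsigned expansion

      return (int32_t)((uq ^ sign) - sign);    // reapply the sign
    }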