1 files changed, 59 insertions, 34 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 60e79c2c6c2f..a55729586b8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -148,11 +148,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
-
+  /// \returns The minimum number of bits needed to store the value of \p Op as
+  /// an unsigned integer. Truncating to this size and then zero-extending to
+  /// \p ScalarSize will not change the value.
   unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+
+  /// \returns The minimum number of bits needed to store the value of \p Op as
+  /// a signed integer. Truncating to this size and then sign-extending to
+  /// \p ScalarSize will not change the value.
   unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
-  bool isI24(Value *V, unsigned ScalarSize) const;
-  bool isU24(Value *V, unsigned ScalarSize) const;
 
   /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
   /// SelectionDAG has an issue where an and asserting the bits are known
@@ -451,17 +455,7 @@ unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                              unsigned ScalarSize) const {
   // In order for this to be a signed 24-bit value, bit 23, must
   // be a sign bit.
-  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
-}
-
-bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
-  return ScalarSize >= 24 && // Types less than 24-bit should be treated
-                                     // as unsigned 24-bit values.
-    numBitsSigned(V, ScalarSize) < 24;
-}
-
-bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
-  return numBitsUnsigned(V, ScalarSize) <= 24;
+  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1;
 }
 
 static void extractValues(IRBuilder<> &Builder,
@@ -489,6 +483,34 @@ static Value *insertValues(IRBuilder<> &Builder,
   return NewVal;
 }
 
+// Emits the product of `LHS` and `RHS` using the 24-bit multiply intrinsics.
+// `Size` is the width of the original destination; `NumBits` is the caller's
+// bit count for the result, which must not exceed 48 on the two-part path.
+static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                       unsigned Size, unsigned NumBits, bool IsSigned) {
+  Intrinsic::ID MulLoID =
+      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+
+  // A lone mul24 is enough unless both the type and the product exceed 32 bits.
+  bool NeedsHighHalf = Size > 32 && NumBits > 32;
+  if (!NeedsHighHalf)
+    return Builder.CreateIntrinsic(MulLoID, {}, {LHS, RHS});
+
+  assert(NumBits <= 48);
+
+  Intrinsic::ID MulHiID =
+      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
+
+  // Emit the low and high parts, widen each to i64, then merge them.
+  Value *LowPart = Builder.CreateIntrinsic(MulLoID, {}, {LHS, RHS});
+  Value *HighPart = Builder.CreateIntrinsic(MulHiID, {}, {LHS, RHS});
+  IntegerType *WideTy = Builder.getInt64Ty();
+  Value *LowWide = Builder.CreateZExtOrTrunc(LowPart, WideTy);
+  Value *HighWide = Builder.CreateZExtOrTrunc(HighPart, WideTy);
+  Value *HighShifted = Builder.CreateShl(HighWide, 32);
+  return Builder.CreateOr(LowWide, HighShifted);
+}
+
 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   if (I.getOpcode() != Instruction::Mul)
     return false;
@@ -507,13 +529,17 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
 
-  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+  unsigned LHSBits = 0, RHSBits = 0;
+  bool IsSigned = false;
+
+  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
+      (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
+    IsSigned = false;
+
+  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 &&
+             (RHSBits = numBitsSigned(RHS, Size)) <= 24) {
+    IsSigned = true;
 
-  // TODO: Should this try to match mulhi24?
-  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
-    IntrID = Intrinsic::amdgcn_mul_u24;
-  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
-    IntrID = Intrinsic::amdgcn_mul_i24;
   } else
     return false;
 
@@ -523,27 +549,26 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   extractValues(Builder, LHSVals, LHS);
   extractValues(Builder, RHSVals, RHS);
 
-
   IntegerType *I32Ty = Builder.getInt32Ty();
-  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
   for (int I = 0, E = LHSVals.size(); I != E; ++I) {
     Value *LHS, *RHS;
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
-      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
-    } else {
+    if (IsSigned) {
       LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
       RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+    } else {
+      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
     }
 
-    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+    Value *Result =
+        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
 
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
-      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
-                                                     LHSVals[I]->getType()));
+    if (IsSigned) {
+      ResultVals.push_back(
+          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
     } else {
-      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
-                                                     LHSVals[I]->getType()));
+      ResultVals.push_back(
+          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
     }
   }
 
@@ -816,7 +841,7 @@ bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
   if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
     return visitBinaryOperator(I);
 
-  // Check if the Call is an intrinsic intruction to amdgcn_class intrinsic
+  // Check if the Call is an intrinsic instruction to amdgcn_class intrinsic
   // has only one use
   if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
       !IntrinsicCall->hasOneUse())
@@ -1314,7 +1339,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
       ConstantInt *Lower =
         mdconst::extract<ConstantInt>(Range->getOperand(0));
 
-      if (Lower->getValue().isNullValue()) {
+      if (Lower->isNullValue()) {
         WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
       } else {
         Metadata *LowAndHigh[] = {