summaryrefslogtreecommitdiff
path: root/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp524
1 files changed, 524 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..b312dbc8d14d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,524 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <iterator>
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+ public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+ const SISubtarget *ST = nullptr;
+ DivergenceAnalysis *DA = nullptr;
+ Module *Mod = nullptr;
+ bool HasUnsafeFPMath = false;
+
+ /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+ /// binary operation \p V.
+ ///
+ /// \returns Binary operation \p V.
+ /// \returns \p T's base element bit width.
+ unsigned getBaseElementBitWidth(const Type *T) const;
+
+ /// \returns Equivalent 32 bit integer type for given type \p T. For example,
+ /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
+ /// is returned.
+ Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
+
+ /// \returns True if binary operation \p I is a signed binary operation, false
+ /// otherwise.
+ bool isSigned(const BinaryOperator &I) const;
+
+ /// \returns True if the condition of 'select' operation \p I comes from a
+ /// signed 'icmp' operation, false otherwise.
+ bool isSigned(const SelectInst &I) const;
+
+ /// \returns True if type \p T needs to be promoted to 32 bit integer type,
+ /// false otherwise.
+ bool needsPromotionToI32(const Type *T) const;
+
+ /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
+ /// truncating the result of 32 bit binary operation back to \p I's original
+ /// type. Division operation is not promoted.
+ ///
+ /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
+ /// false otherwise.
+ bool promoteUniformOpToI32(BinaryOperator &I) const;
+
+ /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(ICmpInst &I) const;
+
+ /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+ /// operation.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by sign or zero extending operands to
+ /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
+ /// result of 32 bit 'select' operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformOpToI32(SelectInst &I) const;
+
+ /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+ /// intrinsic.
+ ///
+ /// \details \p I's base element bit width must be greater than 1 and less
+ /// than or equal 16. Promotion is done by zero extending the operand to 32
+ /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
+ /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
+ /// shift amount is 32 minus \p I's base element bit width), and truncating
+ /// the result of the shift operation back to \p I's original type.
+ ///
+ /// \returns True.
+ bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+public:
+ static char ID;
+
+ AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
+
+ bool visitFDiv(BinaryOperator &I);
+
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitBinaryOperator(BinaryOperator &I);
+ bool visitICmpInst(ICmpInst &I);
+ bool visitSelectInst(SelectInst &I);
+
+ bool visitIntrinsicInst(IntrinsicInst &I);
+ bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+ assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+ if (T->isIntegerTy())
+ return T->getIntegerBitWidth();
+ return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
+}
+
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+ assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+ if (T->isIntegerTy())
+ return B.getInt32Ty();
+ return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+ return I.getOpcode() == Instruction::AShr ||
+ I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+ return isa<ICmpInst>(I.getOperand(0)) ?
+ cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
+}
+
+bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+ const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+ if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
+ return true;
+
+ if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+ // TODO: The set of packed operations is more limited, so may want to
+ // promote some anyway.
+ if (ST->hasVOP3PInsts())
+ return false;
+
+ return needsPromotionToI32(VT->getElementType());
+ }
+
+ return false;
+}
+
+// Return true if the op promoted to i32 should have nsw set.
+static bool promotedOpIsNSW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Sub:
+ return true;
+ case Instruction::Mul:
+ return I.hasNoUnsignedWrap();
+ default:
+ return false;
+ }
+}
+
+// Return true if the op promoted to i32 should have nuw set.
+static bool promotedOpIsNUW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Mul:
+ return true;
+ case Instruction::Sub:
+ return I.hasNoUnsignedWrap();
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ if (I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::UDiv)
+ return false;
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Value *ExtOp0 = nullptr;
+ Value *ExtOp1 = nullptr;
+ Value *ExtRes = nullptr;
+ Value *TruncRes = nullptr;
+
+ if (isSigned(I)) {
+ ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ } else {
+ ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ }
+
+ ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+ if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
+ if (promotedOpIsNSW(cast<Instruction>(I)))
+ Inst->setHasNoSignedWrap();
+
+ if (promotedOpIsNUW(cast<Instruction>(I)))
+ Inst->setHasNoUnsignedWrap();
+
+ if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
+ Inst->setIsExact(ExactOp->isExact());
+ }
+
+ TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+ assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
+ Value *ExtOp0 = nullptr;
+ Value *ExtOp1 = nullptr;
+ Value *NewICmp = nullptr;
+
+ if (I.isSigned()) {
+ ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ } else {
+ ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ }
+ NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+ I.replaceAllUsesWith(NewICmp);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Value *ExtOp1 = nullptr;
+ Value *ExtOp2 = nullptr;
+ Value *ExtRes = nullptr;
+ Value *TruncRes = nullptr;
+
+ if (isSigned(I)) {
+ ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+ ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
+ } else {
+ ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+ ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
+ }
+ ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+ TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+ IntrinsicInst &I) const {
+ assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
+ "I must be bitreverse intrinsic");
+ assert(needsPromotionToI32(I.getType()) &&
+ "I does not need promotion to i32");
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *I32Ty = getI32Ty(Builder, I.getType());
+ Function *I32 =
+ Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+ Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+ Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
+ Value *LShrOp =
+ Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
+ Value *TruncRes =
+ Builder.CreateTrunc(LShrOp, I.getType());
+
+ I.replaceAllUsesWith(TruncRes);
+ I.eraseFromParent();
+
+ return true;
+}
+
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+ const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+ if (!CNum)
+ return false;
+
+ // Reciprocal f32 is handled separately without denormals.
+ return UnsafeDiv || CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+ Type *Ty = FDiv.getType();
+
+ if (!Ty->getScalarType()->isFloatTy())
+ return false;
+
+ MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+ if (!FPMath)
+ return false;
+
+ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+ float ULP = FPOp->getFPAccuracy();
+ if (ULP < 2.5f)
+ return false;
+
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+ FMF.allowReciprocal();
+ if (ST->hasFP32Denormals() && !UnsafeDiv)
+ return false;
+
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ Builder.setFastMathFlags(FMF);
+ Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ Value *NewFDiv = nullptr;
+
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewFDiv = UndefValue::get(VT);
+
+ // FIXME: Doesn't do the right thing for cases where the vector is partially
+ // constant. This works when the scalarizer pass is run first.
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+ Value *NumEltI = Builder.CreateExtractElement(Num, I);
+ Value *DenEltI = Builder.CreateExtractElement(Den, I);
+ Value *NewElt;
+
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+ } else {
+ NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+ }
+
+ NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+ }
+ } else {
+ if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ }
+
+ if (NewFDiv) {
+ FDiv.replaceAllUsesWith(NewFDiv);
+ NewFDiv->takeName(&FDiv);
+ FDiv.eraseFromParent();
+ }
+
+ return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+ Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+ return Attr.getValueAsString() == "true";
+}
+
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformOpToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
+ case Intrinsic::bitreverse:
+ return visitBitreverseIntrinsicInst(I);
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+ bool Changed = false;
+
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+ DA->isUniform(&I))
+ Changed |= promoteUniformBitreverseToI32(I);
+
+ return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<SISubtarget>(F);
+ DA = &getAnalysis<DivergenceAnalysis>();
+ HasUnsafeFPMath = hasUnsafeFPMath(F);
+
+ bool MadeChange = false;
+
+ for (BasicBlock &BB : F) {
+ BasicBlock::iterator Next;
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ Next = std::next(I);
+ MadeChange |= visit(*I);
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
+ return new AMDGPUCodeGenPrepare();
+}