Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp  301
1 file changed, 294 insertions(+), 7 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b955e231699a..e6230547a9b3 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -39,6 +39,78 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   Module *Mod;
   bool HasUnsafeFPMath;
 
+  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+  /// binary operation \p V.
+  ///
+  /// \returns Binary operation \p V.
+  Value *copyFlags(const BinaryOperator &I, Value *V) const;
+
+  /// \returns \p T's base element bit width.
+  unsigned getBaseElementBitWidth(const Type *T) const;
+
+  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
+  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
+  /// is returned.
+  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
+
+  /// \returns True if binary operation \p I is a signed binary operation,
+  /// false otherwise.
+  bool isSigned(const BinaryOperator &I) const;
+
+  /// \returns True if the condition of 'select' operation \p I comes from a
+  /// signed 'icmp' operation, false otherwise.
+  bool isSigned(const SelectInst &I) const;
+
+  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
+  /// false otherwise.
+  bool needsPromotionToI32(const Type *T) const;
+
+  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+  /// operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
+  /// truncating the result of 32 bit binary operation back to \p I's original
+  /// type. Division operation is not promoted.
+  ///
+  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
+  /// false otherwise.
+  bool promoteUniformOpToI32(BinaryOperator &I) const;
+
+  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
+  ///
+  /// \returns True.
+  bool promoteUniformOpToI32(ICmpInst &I) const;
+
+  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+  /// operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
+  /// result of 32 bit 'select' operation back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteUniformOpToI32(SelectInst &I) const;
+
+  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit
+  /// 'bitreverse' intrinsic.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by zero extending the operand to 32
+  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
+  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
+  /// shift amount is 32 minus \p I's base element bit width), and truncating
+  /// the result of the shift operation back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -51,16 +123,18 @@ public:
   bool visitFDiv(BinaryOperator &I);
 
-  bool visitInstruction(Instruction &I) {
-    return false;
-  }
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitICmpInst(ICmpInst &I);
+  bool visitSelectInst(SelectInst &I);
+
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
 
-  const char *getPassName() const override {
-    return "AMDGPU IR optimizations";
-  }
+  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DivergenceAnalysis>();
@@ -70,6 +144,171 @@ public:
 
 } // End anonymous namespace
 
+Value *AMDGPUCodeGenPrepare::copyFlags(
+    const BinaryOperator &I, Value *V) const {
+  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+  if (!BinOp) // Possibly constant expression.
+    return V;
+
+  if (isa<OverflowingBinaryOperator>(BinOp)) {
+    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
+    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+  } else if (isa<PossiblyExactOperator>(BinOp))
+    BinOp->setIsExact(I.isExact());
+
+  return V;
+}
+
+unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  if (T->isIntegerTy())
+    return T->getIntegerBitWidth();
+  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
+}
+
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  if (T->isIntegerTy())
+    return B.getInt32Ty();
+  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+  return I.getOpcode() == Instruction::AShr ||
+      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+  return isa<ICmpInst>(I.getOperand(0)) ?
+      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
+}
+
+bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
+      T->getIntegerBitWidth() <= 16)
+    return true;
+  if (!T->isVectorTy())
+    return false;
+  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  if (I.getOpcode() == Instruction::SDiv ||
+      I.getOpcode() == Instruction::UDiv)
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *NewICmp = nullptr;
+
+  if (I.isSigned()) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+  I.replaceAllUsesWith(NewICmp);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp1 = nullptr;
+  Value *ExtOp2 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
+  } else {
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
+  }
+  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+    IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
+         "I must be bitreverse intrinsic");
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Function *I32 =
+      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
+  Value *LShrOp =
+      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
+  Value *TruncRes = Builder.CreateTrunc(LShrOp, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -85,7 +324,6 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
   Type *Ty = FDiv.getType();
 
-  // TODO: Handle half
   if (!Ty->getScalarType()->isFloatTy())
     return false;
 
@@ -154,6 +392,55 @@ static bool hasUnsafeFPMath(const Function &F) {
   return Attr.getValueAsString() == "true";
 }
 
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::bitreverse:
+    return visitBitreverseIntrinsicInst(I);
+  default:
+    return false;
+  }
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformBitreverseToI32(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
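To make the transformation concrete, here is a minimal before/after sketch of what promoteUniformOpToI32 does to a uniform i16 add on a subtarget with 16-bit instructions. This is hand-written for illustration, not taken from the commit's tests; the value names %a, %b, %r are hypothetical. Since add is not one of AShr/SDiv/SRem, isSigned() returns false and the operands are zero extended; copyFlags() carries the nsw flag over to the new 32 bit add.

    ; before: uniform 16 bit binary operation
    %r = add nsw i16 %a, %b

    ; after: extend operands to 32 bits, operate, truncate back
    %ext0 = zext i16 %a to i32
    %ext1 = zext i16 %b to i32
    %add  = add nsw i32 %ext0, %ext1
    %res  = trunc i32 %add to i16    ; all uses of %r are replaced with %res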
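Likewise for promoteUniformBitreverseToI32 (again a hypothetical sketch): reversing a zero extended i16 inside 32 bits leaves the meaningful bits in the top half, so the result is shifted right with zero fill by 32 - 16 = 16 (32 minus the base element bit width) before truncating.

    ; before
    %r = call i16 @llvm.bitreverse.i16(i16 %x)

    ; after
    %ext = zext i16 %x to i32
    %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
    %shr = lshr i32 %rev, 16         ; 32 - getBaseElementBitWidth(i16)
    %res = trunc i32 %shr to i16     ; all uses of %r are replaced with %res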
